diff --git a/CMakeLists.txt b/CMakeLists.txt index d460f1f7d..6a846ecd0 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -319,7 +319,7 @@ if (ENABLE_LIBUSB) endif() if (ARCHITECTURE_x86 OR ARCHITECTURE_x86_64) - find_package(xbyak 6 CONFIG) + find_package(xbyak 7 CONFIG) endif() if (ARCHITECTURE_arm64) diff --git a/externals/CMakeLists.txt b/externals/CMakeLists.txt index d49a2e43e..9d77d53b3 100644 --- a/externals/CMakeLists.txt +++ b/externals/CMakeLists.txt @@ -314,3 +314,10 @@ endif() if (NOT TARGET SimpleIni::SimpleIni) add_subdirectory(simpleini) endif() + +# sse2neon +if (ARCHITECTURE_arm64 AND NOT TARGET sse2neon) + add_library(sse2neon INTERFACE) + target_include_directories(sse2neon INTERFACE sse2neon) +endif() + diff --git a/externals/sse2neon/sse2neon.h b/externals/sse2neon/sse2neon.h new file mode 100755 index 000000000..66b93c1c7 --- /dev/null +++ b/externals/sse2neon/sse2neon.h @@ -0,0 +1,9285 @@ +// SPDX-FileCopyrightText: Copyright 2015-2024 SSE2NEON Contributors +// SPDX-License-Identifier: MIT + +#ifndef SSE2NEON_H +#define SSE2NEON_H + +/* + * sse2neon is freely redistributable under the MIT License. + * + * Copyright (c) 2015-2024 SSE2NEON Contributors. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +// This header file provides a simple API translation layer +// between SSE intrinsics to their corresponding Arm/Aarch64 NEON versions +// +// Contributors to this work are: +// John W. Ratcliff +// Brandon Rowlett +// Ken Fast +// Eric van Beurden +// Alexander Potylitsin +// Hasindu Gamaarachchi +// Jim Huang +// Mark Cheng +// Malcolm James MacLeod +// Devin Hussey (easyaspi314) +// Sebastian Pop +// Developer Ecosystem Engineering +// Danila Kutenin +// François Turban (JishinMaster) +// Pei-Hsuan Hung +// Yang-Hao Yuan +// Syoyo Fujita +// Brecht Van Lommel +// Jonathan Hue +// Cuda Chen +// Aymen Qader +// Anthony Roberts + +/* Tunable configurations */ + +/* Enable precise implementation of math operations + * This would slow down the computation a bit, but gives consistent result with + * x86 SSE. (e.g. would solve a hole or NaN pixel in the rendering result) + */ +/* _mm_min|max_ps|ss|pd|sd */ +#ifndef SSE2NEON_PRECISE_MINMAX +#define SSE2NEON_PRECISE_MINMAX (0) +#endif +/* _mm_rcp_ps and _mm_div_ps */ +#ifndef SSE2NEON_PRECISE_DIV +#define SSE2NEON_PRECISE_DIV (0) +#endif +/* _mm_sqrt_ps and _mm_rsqrt_ps */ +#ifndef SSE2NEON_PRECISE_SQRT +#define SSE2NEON_PRECISE_SQRT (0) +#endif +/* _mm_dp_pd */ +#ifndef SSE2NEON_PRECISE_DP +#define SSE2NEON_PRECISE_DP (0) +#endif + +/* Enable inclusion of windows.h on MSVC platforms + * This makes _mm_clflush functional on windows, as there is no builtin. + */ +#ifndef SSE2NEON_INCLUDE_WINDOWS_H +#define SSE2NEON_INCLUDE_WINDOWS_H (0) +#endif + +/* compiler specific definitions */ +#if defined(__GNUC__) || defined(__clang__) +#pragma push_macro("FORCE_INLINE") +#pragma push_macro("ALIGN_STRUCT") +#define FORCE_INLINE static inline __attribute__((always_inline)) +#define ALIGN_STRUCT(x) __attribute__((aligned(x))) +#define _sse2neon_likely(x) __builtin_expect(!!(x), 1) +#define _sse2neon_unlikely(x) __builtin_expect(!!(x), 0) +#elif defined(_MSC_VER) +#if _MSVC_TRADITIONAL +#error Using the traditional MSVC preprocessor is not supported! Use /Zc:preprocessor instead. +#endif +#ifndef FORCE_INLINE +#define FORCE_INLINE static inline +#endif +#ifndef ALIGN_STRUCT +#define ALIGN_STRUCT(x) __declspec(align(x)) +#endif +#define _sse2neon_likely(x) (x) +#define _sse2neon_unlikely(x) (x) +#else +#pragma message("Macro name collisions may happen with unsupported compilers.") +#endif + +#if !defined(__clang__) && defined(__GNUC__) && __GNUC__ < 10 +#warning "GCC versions earlier than 10 are not supported." +#endif + +/* C language does not allow initializing a variable with a function call. */ +#ifdef __cplusplus +#define _sse2neon_const static const +#else +#define _sse2neon_const const +#endif + +#include +#include + +#if defined(_WIN32) +/* Definitions for _mm_{malloc,free} are provided by + * from both MinGW-w64 and MSVC. + */ +#define SSE2NEON_ALLOC_DEFINED +#endif + +/* If using MSVC */ +#ifdef _MSC_VER +#include +#if SSE2NEON_INCLUDE_WINDOWS_H +#include +#include +#endif + +#if !defined(__cplusplus) +#error SSE2NEON only supports C++ compilation with this compiler +#endif + +#ifdef SSE2NEON_ALLOC_DEFINED +#include +#endif + +#if (defined(_M_AMD64) || defined(__x86_64__)) || \ + (defined(_M_ARM64) || defined(__arm64__)) +#define SSE2NEON_HAS_BITSCAN64 +#endif +#endif + +#if defined(__GNUC__) || defined(__clang__) +#define _sse2neon_define0(type, s, body) \ + __extension__({ \ + type _a = (s); \ + body \ + }) +#define _sse2neon_define1(type, s, body) \ + __extension__({ \ + type _a = (s); \ + body \ + }) +#define _sse2neon_define2(type, a, b, body) \ + __extension__({ \ + type _a = (a), _b = (b); \ + body \ + }) +#define _sse2neon_return(ret) (ret) +#else +#define _sse2neon_define0(type, a, body) [=](type _a) { body }(a) +#define _sse2neon_define1(type, a, body) [](type _a) { body }(a) +#define _sse2neon_define2(type, a, b, body) \ + [](type _a, type _b) { body }((a), (b)) +#define _sse2neon_return(ret) return ret +#endif + +#define _sse2neon_init(...) \ + { \ + __VA_ARGS__ \ + } + +/* Compiler barrier */ +#if defined(_MSC_VER) +#define SSE2NEON_BARRIER() _ReadWriteBarrier() +#else +#define SSE2NEON_BARRIER() \ + do { \ + __asm__ __volatile__("" ::: "memory"); \ + (void) 0; \ + } while (0) +#endif + +/* Memory barriers + * __atomic_thread_fence does not include a compiler barrier; instead, + * the barrier is part of __atomic_load/__atomic_store's "volatile-like" + * semantics. + */ +#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L) +#include +#endif + +FORCE_INLINE void _sse2neon_smp_mb(void) +{ + SSE2NEON_BARRIER(); +#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L) && \ + !defined(__STDC_NO_ATOMICS__) + atomic_thread_fence(memory_order_seq_cst); +#elif defined(__GNUC__) || defined(__clang__) + __atomic_thread_fence(__ATOMIC_SEQ_CST); +#else /* MSVC */ + __dmb(_ARM64_BARRIER_ISH); +#endif +} + +/* Architecture-specific build options */ +/* FIXME: #pragma GCC push_options is only available on GCC */ +#if defined(__GNUC__) +#if defined(__arm__) && __ARM_ARCH == 7 +/* According to ARM C Language Extensions Architecture specification, + * __ARM_NEON is defined to a value indicating the Advanced SIMD (NEON) + * architecture supported. + */ +#if !defined(__ARM_NEON) || !defined(__ARM_NEON__) +#error "You must enable NEON instructions (e.g. -mfpu=neon) to use SSE2NEON." +#endif +#if !defined(__clang__) +#pragma GCC push_options +#pragma GCC target("fpu=neon") +#endif +#elif defined(__aarch64__) || defined(_M_ARM64) +#if !defined(__clang__) && !defined(_MSC_VER) +#pragma GCC push_options +#pragma GCC target("+simd") +#endif +#elif __ARM_ARCH == 8 +#if !defined(__ARM_NEON) || !defined(__ARM_NEON__) +#error \ + "You must enable NEON instructions (e.g. -mfpu=neon-fp-armv8) to use SSE2NEON." +#endif +#if !defined(__clang__) && !defined(_MSC_VER) +#pragma GCC push_options +#endif +#else +#error "Unsupported target. Must be either ARMv7-A+NEON or ARMv8-A." +#endif +#endif + +#include +#if (!defined(__aarch64__) && !defined(_M_ARM64)) && (__ARM_ARCH == 8) +#if defined __has_include && __has_include() +#include +#endif +#endif + +/* Apple Silicon cache lines are double of what is commonly used by Intel, AMD + * and other Arm microarchitectures use. + * From sysctl -a on Apple M1: + * hw.cachelinesize: 128 + */ +#if defined(__APPLE__) && (defined(__aarch64__) || defined(__arm64__)) +#define SSE2NEON_CACHELINE_SIZE 128 +#else +#define SSE2NEON_CACHELINE_SIZE 64 +#endif + +/* Rounding functions require either Aarch64 instructions or libm fallback */ +#if !defined(__aarch64__) && !defined(_M_ARM64) +#include +#endif + +/* On ARMv7, some registers, such as PMUSERENR and PMCCNTR, are read-only + * or even not accessible in user mode. + * To write or access to these registers in user mode, + * we have to perform syscall instead. + */ +#if (!defined(__aarch64__) && !defined(_M_ARM64)) +#include +#endif + +/* "__has_builtin" can be used to query support for built-in functions + * provided by gcc/clang and other compilers that support it. + */ +#ifndef __has_builtin /* GCC prior to 10 or non-clang compilers */ +/* Compatibility with gcc <= 9 */ +#if defined(__GNUC__) && (__GNUC__ <= 9) +#define __has_builtin(x) HAS##x +#define HAS__builtin_popcount 1 +#define HAS__builtin_popcountll 1 + +// __builtin_shuffle introduced in GCC 4.7.0 +#if (__GNUC__ >= 5) || ((__GNUC__ == 4) && (__GNUC_MINOR__ >= 7)) +#define HAS__builtin_shuffle 1 +#else +#define HAS__builtin_shuffle 0 +#endif + +#define HAS__builtin_shufflevector 0 +#define HAS__builtin_nontemporal_store 0 +#else +#define __has_builtin(x) 0 +#endif +#endif + +/** + * MACRO for shuffle parameter for _mm_shuffle_ps(). + * Argument fp3 is a digit[0123] that represents the fp from argument "b" + * of mm_shuffle_ps that will be placed in fp3 of result. fp2 is the same + * for fp2 in result. fp1 is a digit[0123] that represents the fp from + * argument "a" of mm_shuffle_ps that will be places in fp1 of result. + * fp0 is the same for fp0 of result. + */ +#define _MM_SHUFFLE(fp3, fp2, fp1, fp0) \ + (((fp3) << 6) | ((fp2) << 4) | ((fp1) << 2) | ((fp0))) + +#if __has_builtin(__builtin_shufflevector) +#define _sse2neon_shuffle(type, a, b, ...) \ + __builtin_shufflevector(a, b, __VA_ARGS__) +#elif __has_builtin(__builtin_shuffle) +#define _sse2neon_shuffle(type, a, b, ...) \ + __extension__({ \ + type tmp = {__VA_ARGS__}; \ + __builtin_shuffle(a, b, tmp); \ + }) +#endif + +#ifdef _sse2neon_shuffle +#define vshuffle_s16(a, b, ...) _sse2neon_shuffle(int16x4_t, a, b, __VA_ARGS__) +#define vshuffleq_s16(a, b, ...) _sse2neon_shuffle(int16x8_t, a, b, __VA_ARGS__) +#define vshuffle_s32(a, b, ...) _sse2neon_shuffle(int32x2_t, a, b, __VA_ARGS__) +#define vshuffleq_s32(a, b, ...) _sse2neon_shuffle(int32x4_t, a, b, __VA_ARGS__) +#define vshuffle_s64(a, b, ...) _sse2neon_shuffle(int64x1_t, a, b, __VA_ARGS__) +#define vshuffleq_s64(a, b, ...) _sse2neon_shuffle(int64x2_t, a, b, __VA_ARGS__) +#endif + +/* Rounding mode macros. */ +#define _MM_FROUND_TO_NEAREST_INT 0x00 +#define _MM_FROUND_TO_NEG_INF 0x01 +#define _MM_FROUND_TO_POS_INF 0x02 +#define _MM_FROUND_TO_ZERO 0x03 +#define _MM_FROUND_CUR_DIRECTION 0x04 +#define _MM_FROUND_NO_EXC 0x08 +#define _MM_FROUND_RAISE_EXC 0x00 +#define _MM_FROUND_NINT (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_RAISE_EXC) +#define _MM_FROUND_FLOOR (_MM_FROUND_TO_NEG_INF | _MM_FROUND_RAISE_EXC) +#define _MM_FROUND_CEIL (_MM_FROUND_TO_POS_INF | _MM_FROUND_RAISE_EXC) +#define _MM_FROUND_TRUNC (_MM_FROUND_TO_ZERO | _MM_FROUND_RAISE_EXC) +#define _MM_FROUND_RINT (_MM_FROUND_CUR_DIRECTION | _MM_FROUND_RAISE_EXC) +#define _MM_FROUND_NEARBYINT (_MM_FROUND_CUR_DIRECTION | _MM_FROUND_NO_EXC) +#define _MM_ROUND_NEAREST 0x0000 +#define _MM_ROUND_DOWN 0x2000 +#define _MM_ROUND_UP 0x4000 +#define _MM_ROUND_TOWARD_ZERO 0x6000 +/* Flush zero mode macros. */ +#define _MM_FLUSH_ZERO_MASK 0x8000 +#define _MM_FLUSH_ZERO_ON 0x8000 +#define _MM_FLUSH_ZERO_OFF 0x0000 +/* Denormals are zeros mode macros. */ +#define _MM_DENORMALS_ZERO_MASK 0x0040 +#define _MM_DENORMALS_ZERO_ON 0x0040 +#define _MM_DENORMALS_ZERO_OFF 0x0000 + +/* indicate immediate constant argument in a given range */ +#define __constrange(a, b) const + +/* A few intrinsics accept traditional data types like ints or floats, but + * most operate on data types that are specific to SSE. + * If a vector type ends in d, it contains doubles, and if it does not have + * a suffix, it contains floats. An integer vector type can contain any type + * of integer, from chars to shorts to unsigned long longs. + */ +typedef int64x1_t __m64; +typedef float32x4_t __m128; /* 128-bit vector containing 4 floats */ +// On ARM 32-bit architecture, the float64x2_t is not supported. +// The data type __m128d should be represented in a different way for related +// intrinsic conversion. +#if defined(__aarch64__) || defined(_M_ARM64) +typedef float64x2_t __m128d; /* 128-bit vector containing 2 doubles */ +#else +typedef float32x4_t __m128d; +#endif +typedef int64x2_t __m128i; /* 128-bit vector containing integers */ + +// __int64 is defined in the Intrinsics Guide which maps to different datatype +// in different data model +#if !(defined(_WIN32) || defined(_WIN64) || defined(__int64)) +#if (defined(__x86_64__) || defined(__i386__)) +#define __int64 long long +#else +#define __int64 int64_t +#endif +#endif + +/* type-safe casting between types */ + +#define vreinterpretq_m128_f16(x) vreinterpretq_f32_f16(x) +#define vreinterpretq_m128_f32(x) (x) +#define vreinterpretq_m128_f64(x) vreinterpretq_f32_f64(x) + +#define vreinterpretq_m128_u8(x) vreinterpretq_f32_u8(x) +#define vreinterpretq_m128_u16(x) vreinterpretq_f32_u16(x) +#define vreinterpretq_m128_u32(x) vreinterpretq_f32_u32(x) +#define vreinterpretq_m128_u64(x) vreinterpretq_f32_u64(x) + +#define vreinterpretq_m128_s8(x) vreinterpretq_f32_s8(x) +#define vreinterpretq_m128_s16(x) vreinterpretq_f32_s16(x) +#define vreinterpretq_m128_s32(x) vreinterpretq_f32_s32(x) +#define vreinterpretq_m128_s64(x) vreinterpretq_f32_s64(x) + +#define vreinterpretq_f16_m128(x) vreinterpretq_f16_f32(x) +#define vreinterpretq_f32_m128(x) (x) +#define vreinterpretq_f64_m128(x) vreinterpretq_f64_f32(x) + +#define vreinterpretq_u8_m128(x) vreinterpretq_u8_f32(x) +#define vreinterpretq_u16_m128(x) vreinterpretq_u16_f32(x) +#define vreinterpretq_u32_m128(x) vreinterpretq_u32_f32(x) +#define vreinterpretq_u64_m128(x) vreinterpretq_u64_f32(x) + +#define vreinterpretq_s8_m128(x) vreinterpretq_s8_f32(x) +#define vreinterpretq_s16_m128(x) vreinterpretq_s16_f32(x) +#define vreinterpretq_s32_m128(x) vreinterpretq_s32_f32(x) +#define vreinterpretq_s64_m128(x) vreinterpretq_s64_f32(x) + +#define vreinterpretq_m128i_s8(x) vreinterpretq_s64_s8(x) +#define vreinterpretq_m128i_s16(x) vreinterpretq_s64_s16(x) +#define vreinterpretq_m128i_s32(x) vreinterpretq_s64_s32(x) +#define vreinterpretq_m128i_s64(x) (x) + +#define vreinterpretq_m128i_u8(x) vreinterpretq_s64_u8(x) +#define vreinterpretq_m128i_u16(x) vreinterpretq_s64_u16(x) +#define vreinterpretq_m128i_u32(x) vreinterpretq_s64_u32(x) +#define vreinterpretq_m128i_u64(x) vreinterpretq_s64_u64(x) + +#define vreinterpretq_f32_m128i(x) vreinterpretq_f32_s64(x) +#define vreinterpretq_f64_m128i(x) vreinterpretq_f64_s64(x) + +#define vreinterpretq_s8_m128i(x) vreinterpretq_s8_s64(x) +#define vreinterpretq_s16_m128i(x) vreinterpretq_s16_s64(x) +#define vreinterpretq_s32_m128i(x) vreinterpretq_s32_s64(x) +#define vreinterpretq_s64_m128i(x) (x) + +#define vreinterpretq_u8_m128i(x) vreinterpretq_u8_s64(x) +#define vreinterpretq_u16_m128i(x) vreinterpretq_u16_s64(x) +#define vreinterpretq_u32_m128i(x) vreinterpretq_u32_s64(x) +#define vreinterpretq_u64_m128i(x) vreinterpretq_u64_s64(x) + +#define vreinterpret_m64_s8(x) vreinterpret_s64_s8(x) +#define vreinterpret_m64_s16(x) vreinterpret_s64_s16(x) +#define vreinterpret_m64_s32(x) vreinterpret_s64_s32(x) +#define vreinterpret_m64_s64(x) (x) + +#define vreinterpret_m64_u8(x) vreinterpret_s64_u8(x) +#define vreinterpret_m64_u16(x) vreinterpret_s64_u16(x) +#define vreinterpret_m64_u32(x) vreinterpret_s64_u32(x) +#define vreinterpret_m64_u64(x) vreinterpret_s64_u64(x) + +#define vreinterpret_m64_f16(x) vreinterpret_s64_f16(x) +#define vreinterpret_m64_f32(x) vreinterpret_s64_f32(x) +#define vreinterpret_m64_f64(x) vreinterpret_s64_f64(x) + +#define vreinterpret_u8_m64(x) vreinterpret_u8_s64(x) +#define vreinterpret_u16_m64(x) vreinterpret_u16_s64(x) +#define vreinterpret_u32_m64(x) vreinterpret_u32_s64(x) +#define vreinterpret_u64_m64(x) vreinterpret_u64_s64(x) + +#define vreinterpret_s8_m64(x) vreinterpret_s8_s64(x) +#define vreinterpret_s16_m64(x) vreinterpret_s16_s64(x) +#define vreinterpret_s32_m64(x) vreinterpret_s32_s64(x) +#define vreinterpret_s64_m64(x) (x) + +#define vreinterpret_f32_m64(x) vreinterpret_f32_s64(x) + +#if defined(__aarch64__) || defined(_M_ARM64) +#define vreinterpretq_m128d_s32(x) vreinterpretq_f64_s32(x) +#define vreinterpretq_m128d_s64(x) vreinterpretq_f64_s64(x) + +#define vreinterpretq_m128d_u64(x) vreinterpretq_f64_u64(x) + +#define vreinterpretq_m128d_f32(x) vreinterpretq_f64_f32(x) +#define vreinterpretq_m128d_f64(x) (x) + +#define vreinterpretq_s64_m128d(x) vreinterpretq_s64_f64(x) + +#define vreinterpretq_u32_m128d(x) vreinterpretq_u32_f64(x) +#define vreinterpretq_u64_m128d(x) vreinterpretq_u64_f64(x) + +#define vreinterpretq_f64_m128d(x) (x) +#define vreinterpretq_f32_m128d(x) vreinterpretq_f32_f64(x) +#else +#define vreinterpretq_m128d_s32(x) vreinterpretq_f32_s32(x) +#define vreinterpretq_m128d_s64(x) vreinterpretq_f32_s64(x) + +#define vreinterpretq_m128d_u32(x) vreinterpretq_f32_u32(x) +#define vreinterpretq_m128d_u64(x) vreinterpretq_f32_u64(x) + +#define vreinterpretq_m128d_f32(x) (x) + +#define vreinterpretq_s64_m128d(x) vreinterpretq_s64_f32(x) + +#define vreinterpretq_u32_m128d(x) vreinterpretq_u32_f32(x) +#define vreinterpretq_u64_m128d(x) vreinterpretq_u64_f32(x) + +#define vreinterpretq_f32_m128d(x) (x) +#endif + +// A struct is defined in this header file called 'SIMDVec' which can be used +// by applications which attempt to access the contents of an __m128 struct +// directly. It is important to note that accessing the __m128 struct directly +// is bad coding practice by Microsoft: @see: +// https://learn.microsoft.com/en-us/cpp/cpp/m128 +// +// However, some legacy source code may try to access the contents of an __m128 +// struct directly so the developer can use the SIMDVec as an alias for it. Any +// casting must be done manually by the developer, as you cannot cast or +// otherwise alias the base NEON data type for intrinsic operations. +// +// union intended to allow direct access to an __m128 variable using the names +// that the MSVC compiler provides. This union should really only be used when +// trying to access the members of the vector as integer values. GCC/clang +// allow native access to the float members through a simple array access +// operator (in C since 4.6, in C++ since 4.8). +// +// Ideally direct accesses to SIMD vectors should not be used since it can cause +// a performance hit. If it really is needed however, the original __m128 +// variable can be aliased with a pointer to this union and used to access +// individual components. The use of this union should be hidden behind a macro +// that is used throughout the codebase to access the members instead of always +// declaring this type of variable. +typedef union ALIGN_STRUCT(16) SIMDVec { + float m128_f32[4]; // as floats - DON'T USE. Added for convenience. + int8_t m128_i8[16]; // as signed 8-bit integers. + int16_t m128_i16[8]; // as signed 16-bit integers. + int32_t m128_i32[4]; // as signed 32-bit integers. + int64_t m128_i64[2]; // as signed 64-bit integers. + uint8_t m128_u8[16]; // as unsigned 8-bit integers. + uint16_t m128_u16[8]; // as unsigned 16-bit integers. + uint32_t m128_u32[4]; // as unsigned 32-bit integers. + uint64_t m128_u64[2]; // as unsigned 64-bit integers. +} SIMDVec; + +// casting using SIMDVec +#define vreinterpretq_nth_u64_m128i(x, n) (((SIMDVec *) &x)->m128_u64[n]) +#define vreinterpretq_nth_u32_m128i(x, n) (((SIMDVec *) &x)->m128_u32[n]) +#define vreinterpretq_nth_u8_m128i(x, n) (((SIMDVec *) &x)->m128_u8[n]) + +/* SSE macros */ +#define _MM_GET_FLUSH_ZERO_MODE _sse2neon_mm_get_flush_zero_mode +#define _MM_SET_FLUSH_ZERO_MODE _sse2neon_mm_set_flush_zero_mode +#define _MM_GET_DENORMALS_ZERO_MODE _sse2neon_mm_get_denormals_zero_mode +#define _MM_SET_DENORMALS_ZERO_MODE _sse2neon_mm_set_denormals_zero_mode + +// Function declaration +// SSE +FORCE_INLINE unsigned int _MM_GET_ROUNDING_MODE(void); +FORCE_INLINE __m128 _mm_move_ss(__m128, __m128); +FORCE_INLINE __m128 _mm_or_ps(__m128, __m128); +FORCE_INLINE __m128 _mm_set_ps1(float); +FORCE_INLINE __m128 _mm_setzero_ps(void); +// SSE2 +FORCE_INLINE __m128i _mm_and_si128(__m128i, __m128i); +FORCE_INLINE __m128i _mm_castps_si128(__m128); +FORCE_INLINE __m128i _mm_cmpeq_epi32(__m128i, __m128i); +FORCE_INLINE __m128i _mm_cvtps_epi32(__m128); +FORCE_INLINE __m128d _mm_move_sd(__m128d, __m128d); +FORCE_INLINE __m128i _mm_or_si128(__m128i, __m128i); +FORCE_INLINE __m128i _mm_set_epi32(int, int, int, int); +FORCE_INLINE __m128i _mm_set_epi64x(int64_t, int64_t); +FORCE_INLINE __m128d _mm_set_pd(double, double); +FORCE_INLINE __m128i _mm_set1_epi32(int); +FORCE_INLINE __m128i _mm_setzero_si128(void); +// SSE4.1 +FORCE_INLINE __m128d _mm_ceil_pd(__m128d); +FORCE_INLINE __m128 _mm_ceil_ps(__m128); +FORCE_INLINE __m128d _mm_floor_pd(__m128d); +FORCE_INLINE __m128 _mm_floor_ps(__m128); +FORCE_INLINE __m128d _mm_round_pd(__m128d, int); +FORCE_INLINE __m128 _mm_round_ps(__m128, int); +// SSE4.2 +FORCE_INLINE uint32_t _mm_crc32_u8(uint32_t, uint8_t); + +/* Backwards compatibility for compilers with lack of specific type support */ + +// Older gcc does not define vld1q_u8_x4 type +#if defined(__GNUC__) && !defined(__clang__) && \ + ((__GNUC__ <= 13 && defined(__arm__)) || \ + (__GNUC__ == 10 && __GNUC_MINOR__ < 3 && defined(__aarch64__)) || \ + (__GNUC__ <= 9 && defined(__aarch64__))) +FORCE_INLINE uint8x16x4_t _sse2neon_vld1q_u8_x4(const uint8_t *p) +{ + uint8x16x4_t ret; + ret.val[0] = vld1q_u8(p + 0); + ret.val[1] = vld1q_u8(p + 16); + ret.val[2] = vld1q_u8(p + 32); + ret.val[3] = vld1q_u8(p + 48); + return ret; +} +#else +// Wraps vld1q_u8_x4 +FORCE_INLINE uint8x16x4_t _sse2neon_vld1q_u8_x4(const uint8_t *p) +{ + return vld1q_u8_x4(p); +} +#endif + +#if !defined(__aarch64__) && !defined(_M_ARM64) +/* emulate vaddv u8 variant */ +FORCE_INLINE uint8_t _sse2neon_vaddv_u8(uint8x8_t v8) +{ + const uint64x1_t v1 = vpaddl_u32(vpaddl_u16(vpaddl_u8(v8))); + return vget_lane_u8(vreinterpret_u8_u64(v1), 0); +} +#else +// Wraps vaddv_u8 +FORCE_INLINE uint8_t _sse2neon_vaddv_u8(uint8x8_t v8) +{ + return vaddv_u8(v8); +} +#endif + +#if !defined(__aarch64__) && !defined(_M_ARM64) +/* emulate vaddvq u8 variant */ +FORCE_INLINE uint8_t _sse2neon_vaddvq_u8(uint8x16_t a) +{ + uint8x8_t tmp = vpadd_u8(vget_low_u8(a), vget_high_u8(a)); + uint8_t res = 0; + for (int i = 0; i < 8; ++i) + res += tmp[i]; + return res; +} +#else +// Wraps vaddvq_u8 +FORCE_INLINE uint8_t _sse2neon_vaddvq_u8(uint8x16_t a) +{ + return vaddvq_u8(a); +} +#endif + +#if !defined(__aarch64__) && !defined(_M_ARM64) +/* emulate vaddvq u16 variant */ +FORCE_INLINE uint16_t _sse2neon_vaddvq_u16(uint16x8_t a) +{ + uint32x4_t m = vpaddlq_u16(a); + uint64x2_t n = vpaddlq_u32(m); + uint64x1_t o = vget_low_u64(n) + vget_high_u64(n); + + return vget_lane_u32((uint32x2_t) o, 0); +} +#else +// Wraps vaddvq_u16 +FORCE_INLINE uint16_t _sse2neon_vaddvq_u16(uint16x8_t a) +{ + return vaddvq_u16(a); +} +#endif + +/* Function Naming Conventions + * The naming convention of SSE intrinsics is straightforward. A generic SSE + * intrinsic function is given as follows: + * _mm__ + * + * The parts of this format are given as follows: + * 1. describes the operation performed by the intrinsic + * 2. identifies the data type of the function's primary arguments + * + * This last part, , is a little complicated. It identifies the + * content of the input values, and can be set to any of the following values: + * + ps - vectors contain floats (ps stands for packed single-precision) + * + pd - vectors contain doubles (pd stands for packed double-precision) + * + epi8/epi16/epi32/epi64 - vectors contain 8-bit/16-bit/32-bit/64-bit + * signed integers + * + epu8/epu16/epu32/epu64 - vectors contain 8-bit/16-bit/32-bit/64-bit + * unsigned integers + * + si128 - unspecified 128-bit vector or 256-bit vector + * + m128/m128i/m128d - identifies input vector types when they are different + * than the type of the returned vector + * + * For example, _mm_setzero_ps. The _mm implies that the function returns + * a 128-bit vector. The _ps at the end implies that the argument vectors + * contain floats. + * + * A complete example: Byte Shuffle - pshufb (_mm_shuffle_epi8) + * // Set packed 16-bit integers. 128 bits, 8 short, per 16 bits + * __m128i v_in = _mm_setr_epi16(1, 2, 3, 4, 5, 6, 7, 8); + * // Set packed 8-bit integers + * // 128 bits, 16 chars, per 8 bits + * __m128i v_perm = _mm_setr_epi8(1, 0, 2, 3, 8, 9, 10, 11, + * 4, 5, 12, 13, 6, 7, 14, 15); + * // Shuffle packed 8-bit integers + * __m128i v_out = _mm_shuffle_epi8(v_in, v_perm); // pshufb + */ + +/* Constants for use with _mm_prefetch. */ +enum _mm_hint { + _MM_HINT_NTA = 0, /* load data to L1 and L2 cache, mark it as NTA */ + _MM_HINT_T0 = 1, /* load data to L1 and L2 cache */ + _MM_HINT_T1 = 2, /* load data to L2 cache only */ + _MM_HINT_T2 = 3, /* load data to L2 cache only, mark it as NTA */ +}; + +// The bit field mapping to the FPCR(floating-point control register) +typedef struct { + uint16_t res0; + uint8_t res1 : 6; + uint8_t bit22 : 1; + uint8_t bit23 : 1; + uint8_t bit24 : 1; + uint8_t res2 : 7; +#if defined(__aarch64__) || defined(_M_ARM64) + uint32_t res3; +#endif +} fpcr_bitfield; + +// Takes the upper 64 bits of a and places it in the low end of the result +// Takes the lower 64 bits of b and places it into the high end of the result. +FORCE_INLINE __m128 _mm_shuffle_ps_1032(__m128 a, __m128 b) +{ + float32x2_t a32 = vget_high_f32(vreinterpretq_f32_m128(a)); + float32x2_t b10 = vget_low_f32(vreinterpretq_f32_m128(b)); + return vreinterpretq_m128_f32(vcombine_f32(a32, b10)); +} + +// takes the lower two 32-bit values from a and swaps them and places in high +// end of result takes the higher two 32 bit values from b and swaps them and +// places in low end of result. +FORCE_INLINE __m128 _mm_shuffle_ps_2301(__m128 a, __m128 b) +{ + float32x2_t a01 = vrev64_f32(vget_low_f32(vreinterpretq_f32_m128(a))); + float32x2_t b23 = vrev64_f32(vget_high_f32(vreinterpretq_f32_m128(b))); + return vreinterpretq_m128_f32(vcombine_f32(a01, b23)); +} + +FORCE_INLINE __m128 _mm_shuffle_ps_0321(__m128 a, __m128 b) +{ + float32x2_t a21 = vget_high_f32( + vextq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a), 3)); + float32x2_t b03 = vget_low_f32( + vextq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b), 3)); + return vreinterpretq_m128_f32(vcombine_f32(a21, b03)); +} + +FORCE_INLINE __m128 _mm_shuffle_ps_2103(__m128 a, __m128 b) +{ + float32x2_t a03 = vget_low_f32( + vextq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a), 3)); + float32x2_t b21 = vget_high_f32( + vextq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b), 3)); + return vreinterpretq_m128_f32(vcombine_f32(a03, b21)); +} + +FORCE_INLINE __m128 _mm_shuffle_ps_1010(__m128 a, __m128 b) +{ + float32x2_t a10 = vget_low_f32(vreinterpretq_f32_m128(a)); + float32x2_t b10 = vget_low_f32(vreinterpretq_f32_m128(b)); + return vreinterpretq_m128_f32(vcombine_f32(a10, b10)); +} + +FORCE_INLINE __m128 _mm_shuffle_ps_1001(__m128 a, __m128 b) +{ + float32x2_t a01 = vrev64_f32(vget_low_f32(vreinterpretq_f32_m128(a))); + float32x2_t b10 = vget_low_f32(vreinterpretq_f32_m128(b)); + return vreinterpretq_m128_f32(vcombine_f32(a01, b10)); +} + +FORCE_INLINE __m128 _mm_shuffle_ps_0101(__m128 a, __m128 b) +{ + float32x2_t a01 = vrev64_f32(vget_low_f32(vreinterpretq_f32_m128(a))); + float32x2_t b01 = vrev64_f32(vget_low_f32(vreinterpretq_f32_m128(b))); + return vreinterpretq_m128_f32(vcombine_f32(a01, b01)); +} + +// keeps the low 64 bits of b in the low and puts the high 64 bits of a in the +// high +FORCE_INLINE __m128 _mm_shuffle_ps_3210(__m128 a, __m128 b) +{ + float32x2_t a10 = vget_low_f32(vreinterpretq_f32_m128(a)); + float32x2_t b32 = vget_high_f32(vreinterpretq_f32_m128(b)); + return vreinterpretq_m128_f32(vcombine_f32(a10, b32)); +} + +FORCE_INLINE __m128 _mm_shuffle_ps_0011(__m128 a, __m128 b) +{ + float32x2_t a11 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(a)), 1); + float32x2_t b00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 0); + return vreinterpretq_m128_f32(vcombine_f32(a11, b00)); +} + +FORCE_INLINE __m128 _mm_shuffle_ps_0022(__m128 a, __m128 b) +{ + float32x2_t a22 = + vdup_lane_f32(vget_high_f32(vreinterpretq_f32_m128(a)), 0); + float32x2_t b00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 0); + return vreinterpretq_m128_f32(vcombine_f32(a22, b00)); +} + +FORCE_INLINE __m128 _mm_shuffle_ps_2200(__m128 a, __m128 b) +{ + float32x2_t a00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(a)), 0); + float32x2_t b22 = + vdup_lane_f32(vget_high_f32(vreinterpretq_f32_m128(b)), 0); + return vreinterpretq_m128_f32(vcombine_f32(a00, b22)); +} + +FORCE_INLINE __m128 _mm_shuffle_ps_3202(__m128 a, __m128 b) +{ + float32_t a0 = vgetq_lane_f32(vreinterpretq_f32_m128(a), 0); + float32x2_t a22 = + vdup_lane_f32(vget_high_f32(vreinterpretq_f32_m128(a)), 0); + float32x2_t a02 = vset_lane_f32(a0, a22, 1); /* TODO: use vzip ?*/ + float32x2_t b32 = vget_high_f32(vreinterpretq_f32_m128(b)); + return vreinterpretq_m128_f32(vcombine_f32(a02, b32)); +} + +FORCE_INLINE __m128 _mm_shuffle_ps_1133(__m128 a, __m128 b) +{ + float32x2_t a33 = + vdup_lane_f32(vget_high_f32(vreinterpretq_f32_m128(a)), 1); + float32x2_t b11 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 1); + return vreinterpretq_m128_f32(vcombine_f32(a33, b11)); +} + +FORCE_INLINE __m128 _mm_shuffle_ps_2010(__m128 a, __m128 b) +{ + float32x2_t a10 = vget_low_f32(vreinterpretq_f32_m128(a)); + float32_t b2 = vgetq_lane_f32(vreinterpretq_f32_m128(b), 2); + float32x2_t b00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 0); + float32x2_t b20 = vset_lane_f32(b2, b00, 1); + return vreinterpretq_m128_f32(vcombine_f32(a10, b20)); +} + +FORCE_INLINE __m128 _mm_shuffle_ps_2001(__m128 a, __m128 b) +{ + float32x2_t a01 = vrev64_f32(vget_low_f32(vreinterpretq_f32_m128(a))); + float32_t b2 = vgetq_lane_f32(b, 2); + float32x2_t b00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 0); + float32x2_t b20 = vset_lane_f32(b2, b00, 1); + return vreinterpretq_m128_f32(vcombine_f32(a01, b20)); +} + +FORCE_INLINE __m128 _mm_shuffle_ps_2032(__m128 a, __m128 b) +{ + float32x2_t a32 = vget_high_f32(vreinterpretq_f32_m128(a)); + float32_t b2 = vgetq_lane_f32(b, 2); + float32x2_t b00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 0); + float32x2_t b20 = vset_lane_f32(b2, b00, 1); + return vreinterpretq_m128_f32(vcombine_f32(a32, b20)); +} + +// For MSVC, we check only if it is ARM64, as every single ARM64 processor +// supported by WoA has crypto extensions. If this changes in the future, +// this can be verified via the runtime-only method of: +// IsProcessorFeaturePresent(PF_ARM_V8_CRYPTO_INSTRUCTIONS_AVAILABLE) +#if (defined(_M_ARM64) && !defined(__clang__)) || \ + (defined(__ARM_FEATURE_CRYPTO) && \ + (defined(__aarch64__) || __has_builtin(__builtin_arm_crypto_vmullp64))) +// Wraps vmull_p64 +FORCE_INLINE uint64x2_t _sse2neon_vmull_p64(uint64x1_t _a, uint64x1_t _b) +{ + poly64_t a = vget_lane_p64(vreinterpret_p64_u64(_a), 0); + poly64_t b = vget_lane_p64(vreinterpret_p64_u64(_b), 0); +#if defined(_MSC_VER) + __n64 a1 = {a}, b1 = {b}; + return vreinterpretq_u64_p128(vmull_p64(a1, b1)); +#else + return vreinterpretq_u64_p128(vmull_p64(a, b)); +#endif +} +#else // ARMv7 polyfill +// ARMv7/some A64 lacks vmull_p64, but it has vmull_p8. +// +// vmull_p8 calculates 8 8-bit->16-bit polynomial multiplies, but we need a +// 64-bit->128-bit polynomial multiply. +// +// It needs some work and is somewhat slow, but it is still faster than all +// known scalar methods. +// +// Algorithm adapted to C from +// https://www.workofard.com/2017/07/ghash-for-low-end-cores/, which is adapted +// from "Fast Software Polynomial Multiplication on ARM Processors Using the +// NEON Engine" by Danilo Camara, Conrado Gouvea, Julio Lopez and Ricardo Dahab +// (https://hal.inria.fr/hal-01506572) +static uint64x2_t _sse2neon_vmull_p64(uint64x1_t _a, uint64x1_t _b) +{ + poly8x8_t a = vreinterpret_p8_u64(_a); + poly8x8_t b = vreinterpret_p8_u64(_b); + + // Masks + uint8x16_t k48_32 = vcombine_u8(vcreate_u8(0x0000ffffffffffff), + vcreate_u8(0x00000000ffffffff)); + uint8x16_t k16_00 = vcombine_u8(vcreate_u8(0x000000000000ffff), + vcreate_u8(0x0000000000000000)); + + // Do the multiplies, rotating with vext to get all combinations + uint8x16_t d = vreinterpretq_u8_p16(vmull_p8(a, b)); // D = A0 * B0 + uint8x16_t e = + vreinterpretq_u8_p16(vmull_p8(a, vext_p8(b, b, 1))); // E = A0 * B1 + uint8x16_t f = + vreinterpretq_u8_p16(vmull_p8(vext_p8(a, a, 1), b)); // F = A1 * B0 + uint8x16_t g = + vreinterpretq_u8_p16(vmull_p8(a, vext_p8(b, b, 2))); // G = A0 * B2 + uint8x16_t h = + vreinterpretq_u8_p16(vmull_p8(vext_p8(a, a, 2), b)); // H = A2 * B0 + uint8x16_t i = + vreinterpretq_u8_p16(vmull_p8(a, vext_p8(b, b, 3))); // I = A0 * B3 + uint8x16_t j = + vreinterpretq_u8_p16(vmull_p8(vext_p8(a, a, 3), b)); // J = A3 * B0 + uint8x16_t k = + vreinterpretq_u8_p16(vmull_p8(a, vext_p8(b, b, 4))); // L = A0 * B4 + + // Add cross products + uint8x16_t l = veorq_u8(e, f); // L = E + F + uint8x16_t m = veorq_u8(g, h); // M = G + H + uint8x16_t n = veorq_u8(i, j); // N = I + J + + // Interleave. Using vzip1 and vzip2 prevents Clang from emitting TBL + // instructions. +#if defined(__aarch64__) + uint8x16_t lm_p0 = vreinterpretq_u8_u64( + vzip1q_u64(vreinterpretq_u64_u8(l), vreinterpretq_u64_u8(m))); + uint8x16_t lm_p1 = vreinterpretq_u8_u64( + vzip2q_u64(vreinterpretq_u64_u8(l), vreinterpretq_u64_u8(m))); + uint8x16_t nk_p0 = vreinterpretq_u8_u64( + vzip1q_u64(vreinterpretq_u64_u8(n), vreinterpretq_u64_u8(k))); + uint8x16_t nk_p1 = vreinterpretq_u8_u64( + vzip2q_u64(vreinterpretq_u64_u8(n), vreinterpretq_u64_u8(k))); +#else + uint8x16_t lm_p0 = vcombine_u8(vget_low_u8(l), vget_low_u8(m)); + uint8x16_t lm_p1 = vcombine_u8(vget_high_u8(l), vget_high_u8(m)); + uint8x16_t nk_p0 = vcombine_u8(vget_low_u8(n), vget_low_u8(k)); + uint8x16_t nk_p1 = vcombine_u8(vget_high_u8(n), vget_high_u8(k)); +#endif + // t0 = (L) (P0 + P1) << 8 + // t1 = (M) (P2 + P3) << 16 + uint8x16_t t0t1_tmp = veorq_u8(lm_p0, lm_p1); + uint8x16_t t0t1_h = vandq_u8(lm_p1, k48_32); + uint8x16_t t0t1_l = veorq_u8(t0t1_tmp, t0t1_h); + + // t2 = (N) (P4 + P5) << 24 + // t3 = (K) (P6 + P7) << 32 + uint8x16_t t2t3_tmp = veorq_u8(nk_p0, nk_p1); + uint8x16_t t2t3_h = vandq_u8(nk_p1, k16_00); + uint8x16_t t2t3_l = veorq_u8(t2t3_tmp, t2t3_h); + + // De-interleave +#if defined(__aarch64__) + uint8x16_t t0 = vreinterpretq_u8_u64( + vuzp1q_u64(vreinterpretq_u64_u8(t0t1_l), vreinterpretq_u64_u8(t0t1_h))); + uint8x16_t t1 = vreinterpretq_u8_u64( + vuzp2q_u64(vreinterpretq_u64_u8(t0t1_l), vreinterpretq_u64_u8(t0t1_h))); + uint8x16_t t2 = vreinterpretq_u8_u64( + vuzp1q_u64(vreinterpretq_u64_u8(t2t3_l), vreinterpretq_u64_u8(t2t3_h))); + uint8x16_t t3 = vreinterpretq_u8_u64( + vuzp2q_u64(vreinterpretq_u64_u8(t2t3_l), vreinterpretq_u64_u8(t2t3_h))); +#else + uint8x16_t t1 = vcombine_u8(vget_high_u8(t0t1_l), vget_high_u8(t0t1_h)); + uint8x16_t t0 = vcombine_u8(vget_low_u8(t0t1_l), vget_low_u8(t0t1_h)); + uint8x16_t t3 = vcombine_u8(vget_high_u8(t2t3_l), vget_high_u8(t2t3_h)); + uint8x16_t t2 = vcombine_u8(vget_low_u8(t2t3_l), vget_low_u8(t2t3_h)); +#endif + // Shift the cross products + uint8x16_t t0_shift = vextq_u8(t0, t0, 15); // t0 << 8 + uint8x16_t t1_shift = vextq_u8(t1, t1, 14); // t1 << 16 + uint8x16_t t2_shift = vextq_u8(t2, t2, 13); // t2 << 24 + uint8x16_t t3_shift = vextq_u8(t3, t3, 12); // t3 << 32 + + // Accumulate the products + uint8x16_t cross1 = veorq_u8(t0_shift, t1_shift); + uint8x16_t cross2 = veorq_u8(t2_shift, t3_shift); + uint8x16_t mix = veorq_u8(d, cross1); + uint8x16_t r = veorq_u8(mix, cross2); + return vreinterpretq_u64_u8(r); +} +#endif // ARMv7 polyfill + +// C equivalent: +// __m128i _mm_shuffle_epi32_default(__m128i a, +// __constrange(0, 255) int imm) { +// __m128i ret; +// ret[0] = a[imm & 0x3]; ret[1] = a[(imm >> 2) & 0x3]; +// ret[2] = a[(imm >> 4) & 0x03]; ret[3] = a[(imm >> 6) & 0x03]; +// return ret; +// } +#define _mm_shuffle_epi32_default(a, imm) \ + vreinterpretq_m128i_s32(vsetq_lane_s32( \ + vgetq_lane_s32(vreinterpretq_s32_m128i(a), ((imm) >> 6) & 0x3), \ + vsetq_lane_s32( \ + vgetq_lane_s32(vreinterpretq_s32_m128i(a), ((imm) >> 4) & 0x3), \ + vsetq_lane_s32(vgetq_lane_s32(vreinterpretq_s32_m128i(a), \ + ((imm) >> 2) & 0x3), \ + vmovq_n_s32(vgetq_lane_s32( \ + vreinterpretq_s32_m128i(a), (imm) & (0x3))), \ + 1), \ + 2), \ + 3)) + +// Takes the upper 64 bits of a and places it in the low end of the result +// Takes the lower 64 bits of a and places it into the high end of the result. +FORCE_INLINE __m128i _mm_shuffle_epi_1032(__m128i a) +{ + int32x2_t a32 = vget_high_s32(vreinterpretq_s32_m128i(a)); + int32x2_t a10 = vget_low_s32(vreinterpretq_s32_m128i(a)); + return vreinterpretq_m128i_s32(vcombine_s32(a32, a10)); +} + +// takes the lower two 32-bit values from a and swaps them and places in low end +// of result takes the higher two 32 bit values from a and swaps them and places +// in high end of result. +FORCE_INLINE __m128i _mm_shuffle_epi_2301(__m128i a) +{ + int32x2_t a01 = vrev64_s32(vget_low_s32(vreinterpretq_s32_m128i(a))); + int32x2_t a23 = vrev64_s32(vget_high_s32(vreinterpretq_s32_m128i(a))); + return vreinterpretq_m128i_s32(vcombine_s32(a01, a23)); +} + +// rotates the least significant 32 bits into the most significant 32 bits, and +// shifts the rest down +FORCE_INLINE __m128i _mm_shuffle_epi_0321(__m128i a) +{ + return vreinterpretq_m128i_s32( + vextq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(a), 1)); +} + +// rotates the most significant 32 bits into the least significant 32 bits, and +// shifts the rest up +FORCE_INLINE __m128i _mm_shuffle_epi_2103(__m128i a) +{ + return vreinterpretq_m128i_s32( + vextq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(a), 3)); +} + +// gets the lower 64 bits of a, and places it in the upper 64 bits +// gets the lower 64 bits of a and places it in the lower 64 bits +FORCE_INLINE __m128i _mm_shuffle_epi_1010(__m128i a) +{ + int32x2_t a10 = vget_low_s32(vreinterpretq_s32_m128i(a)); + return vreinterpretq_m128i_s32(vcombine_s32(a10, a10)); +} + +// gets the lower 64 bits of a, swaps the 0 and 1 elements, and places it in the +// lower 64 bits gets the lower 64 bits of a, and places it in the upper 64 bits +FORCE_INLINE __m128i _mm_shuffle_epi_1001(__m128i a) +{ + int32x2_t a01 = vrev64_s32(vget_low_s32(vreinterpretq_s32_m128i(a))); + int32x2_t a10 = vget_low_s32(vreinterpretq_s32_m128i(a)); + return vreinterpretq_m128i_s32(vcombine_s32(a01, a10)); +} + +// gets the lower 64 bits of a, swaps the 0 and 1 elements and places it in the +// upper 64 bits gets the lower 64 bits of a, swaps the 0 and 1 elements, and +// places it in the lower 64 bits +FORCE_INLINE __m128i _mm_shuffle_epi_0101(__m128i a) +{ + int32x2_t a01 = vrev64_s32(vget_low_s32(vreinterpretq_s32_m128i(a))); + return vreinterpretq_m128i_s32(vcombine_s32(a01, a01)); +} + +FORCE_INLINE __m128i _mm_shuffle_epi_2211(__m128i a) +{ + int32x2_t a11 = vdup_lane_s32(vget_low_s32(vreinterpretq_s32_m128i(a)), 1); + int32x2_t a22 = vdup_lane_s32(vget_high_s32(vreinterpretq_s32_m128i(a)), 0); + return vreinterpretq_m128i_s32(vcombine_s32(a11, a22)); +} + +FORCE_INLINE __m128i _mm_shuffle_epi_0122(__m128i a) +{ + int32x2_t a22 = vdup_lane_s32(vget_high_s32(vreinterpretq_s32_m128i(a)), 0); + int32x2_t a01 = vrev64_s32(vget_low_s32(vreinterpretq_s32_m128i(a))); + return vreinterpretq_m128i_s32(vcombine_s32(a22, a01)); +} + +FORCE_INLINE __m128i _mm_shuffle_epi_3332(__m128i a) +{ + int32x2_t a32 = vget_high_s32(vreinterpretq_s32_m128i(a)); + int32x2_t a33 = vdup_lane_s32(vget_high_s32(vreinterpretq_s32_m128i(a)), 1); + return vreinterpretq_m128i_s32(vcombine_s32(a32, a33)); +} + +#if defined(__aarch64__) || defined(_M_ARM64) +#define _mm_shuffle_epi32_splat(a, imm) \ + vreinterpretq_m128i_s32(vdupq_laneq_s32(vreinterpretq_s32_m128i(a), (imm))) +#else +#define _mm_shuffle_epi32_splat(a, imm) \ + vreinterpretq_m128i_s32( \ + vdupq_n_s32(vgetq_lane_s32(vreinterpretq_s32_m128i(a), (imm)))) +#endif + +// NEON does not support a general purpose permute intrinsic. +// Shuffle single-precision (32-bit) floating-point elements in a using the +// control in imm8, and store the results in dst. +// +// C equivalent: +// __m128 _mm_shuffle_ps_default(__m128 a, __m128 b, +// __constrange(0, 255) int imm) { +// __m128 ret; +// ret[0] = a[imm & 0x3]; ret[1] = a[(imm >> 2) & 0x3]; +// ret[2] = b[(imm >> 4) & 0x03]; ret[3] = b[(imm >> 6) & 0x03]; +// return ret; +// } +// +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shuffle_ps +#define _mm_shuffle_ps_default(a, b, imm) \ + vreinterpretq_m128_f32(vsetq_lane_f32( \ + vgetq_lane_f32(vreinterpretq_f32_m128(b), ((imm) >> 6) & 0x3), \ + vsetq_lane_f32( \ + vgetq_lane_f32(vreinterpretq_f32_m128(b), ((imm) >> 4) & 0x3), \ + vsetq_lane_f32( \ + vgetq_lane_f32(vreinterpretq_f32_m128(a), ((imm) >> 2) & 0x3), \ + vmovq_n_f32( \ + vgetq_lane_f32(vreinterpretq_f32_m128(a), (imm) & (0x3))), \ + 1), \ + 2), \ + 3)) + +// Shuffle 16-bit integers in the low 64 bits of a using the control in imm8. +// Store the results in the low 64 bits of dst, with the high 64 bits being +// copied from a to dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shufflelo_epi16 +#define _mm_shufflelo_epi16_function(a, imm) \ + _sse2neon_define1( \ + __m128i, a, int16x8_t ret = vreinterpretq_s16_m128i(_a); \ + int16x4_t lowBits = vget_low_s16(ret); \ + ret = vsetq_lane_s16(vget_lane_s16(lowBits, (imm) & (0x3)), ret, 0); \ + ret = vsetq_lane_s16(vget_lane_s16(lowBits, ((imm) >> 2) & 0x3), ret, \ + 1); \ + ret = vsetq_lane_s16(vget_lane_s16(lowBits, ((imm) >> 4) & 0x3), ret, \ + 2); \ + ret = vsetq_lane_s16(vget_lane_s16(lowBits, ((imm) >> 6) & 0x3), ret, \ + 3); \ + _sse2neon_return(vreinterpretq_m128i_s16(ret));) + +// Shuffle 16-bit integers in the high 64 bits of a using the control in imm8. +// Store the results in the high 64 bits of dst, with the low 64 bits being +// copied from a to dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shufflehi_epi16 +#define _mm_shufflehi_epi16_function(a, imm) \ + _sse2neon_define1( \ + __m128i, a, int16x8_t ret = vreinterpretq_s16_m128i(_a); \ + int16x4_t highBits = vget_high_s16(ret); \ + ret = vsetq_lane_s16(vget_lane_s16(highBits, (imm) & (0x3)), ret, 4); \ + ret = vsetq_lane_s16(vget_lane_s16(highBits, ((imm) >> 2) & 0x3), ret, \ + 5); \ + ret = vsetq_lane_s16(vget_lane_s16(highBits, ((imm) >> 4) & 0x3), ret, \ + 6); \ + ret = vsetq_lane_s16(vget_lane_s16(highBits, ((imm) >> 6) & 0x3), ret, \ + 7); \ + _sse2neon_return(vreinterpretq_m128i_s16(ret));) + +/* MMX */ + +//_mm_empty is a no-op on arm +FORCE_INLINE void _mm_empty(void) {} + +/* SSE */ + +// Add packed single-precision (32-bit) floating-point elements in a and b, and +// store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_ps +FORCE_INLINE __m128 _mm_add_ps(__m128 a, __m128 b) +{ + return vreinterpretq_m128_f32( + vaddq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))); +} + +// Add the lower single-precision (32-bit) floating-point element in a and b, +// store the result in the lower element of dst, and copy the upper 3 packed +// elements from a to the upper elements of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_ss +FORCE_INLINE __m128 _mm_add_ss(__m128 a, __m128 b) +{ + float32_t b0 = vgetq_lane_f32(vreinterpretq_f32_m128(b), 0); + float32x4_t value = vsetq_lane_f32(b0, vdupq_n_f32(0), 0); + // the upper values in the result must be the remnants of . + return vreinterpretq_m128_f32(vaddq_f32(a, value)); +} + +// Compute the bitwise AND of packed single-precision (32-bit) floating-point +// elements in a and b, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_and_ps +FORCE_INLINE __m128 _mm_and_ps(__m128 a, __m128 b) +{ + return vreinterpretq_m128_s32( + vandq_s32(vreinterpretq_s32_m128(a), vreinterpretq_s32_m128(b))); +} + +// Compute the bitwise NOT of packed single-precision (32-bit) floating-point +// elements in a and then AND with b, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_andnot_ps +FORCE_INLINE __m128 _mm_andnot_ps(__m128 a, __m128 b) +{ + return vreinterpretq_m128_s32( + vbicq_s32(vreinterpretq_s32_m128(b), + vreinterpretq_s32_m128(a))); // *NOTE* argument swap +} + +// Average packed unsigned 16-bit integers in a and b, and store the results in +// dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_avg_pu16 +FORCE_INLINE __m64 _mm_avg_pu16(__m64 a, __m64 b) +{ + return vreinterpret_m64_u16( + vrhadd_u16(vreinterpret_u16_m64(a), vreinterpret_u16_m64(b))); +} + +// Average packed unsigned 8-bit integers in a and b, and store the results in +// dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_avg_pu8 +FORCE_INLINE __m64 _mm_avg_pu8(__m64 a, __m64 b) +{ + return vreinterpret_m64_u8( + vrhadd_u8(vreinterpret_u8_m64(a), vreinterpret_u8_m64(b))); +} + +// Compare packed single-precision (32-bit) floating-point elements in a and b +// for equality, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpeq_ps +FORCE_INLINE __m128 _mm_cmpeq_ps(__m128 a, __m128 b) +{ + return vreinterpretq_m128_u32( + vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))); +} + +// Compare the lower single-precision (32-bit) floating-point elements in a and +// b for equality, store the result in the lower element of dst, and copy the +// upper 3 packed elements from a to the upper elements of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpeq_ss +FORCE_INLINE __m128 _mm_cmpeq_ss(__m128 a, __m128 b) +{ + return _mm_move_ss(a, _mm_cmpeq_ps(a, b)); +} + +// Compare packed single-precision (32-bit) floating-point elements in a and b +// for greater-than-or-equal, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpge_ps +FORCE_INLINE __m128 _mm_cmpge_ps(__m128 a, __m128 b) +{ + return vreinterpretq_m128_u32( + vcgeq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))); +} + +// Compare the lower single-precision (32-bit) floating-point elements in a and +// b for greater-than-or-equal, store the result in the lower element of dst, +// and copy the upper 3 packed elements from a to the upper elements of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpge_ss +FORCE_INLINE __m128 _mm_cmpge_ss(__m128 a, __m128 b) +{ + return _mm_move_ss(a, _mm_cmpge_ps(a, b)); +} + +// Compare packed single-precision (32-bit) floating-point elements in a and b +// for greater-than, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpgt_ps +FORCE_INLINE __m128 _mm_cmpgt_ps(__m128 a, __m128 b) +{ + return vreinterpretq_m128_u32( + vcgtq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))); +} + +// Compare the lower single-precision (32-bit) floating-point elements in a and +// b for greater-than, store the result in the lower element of dst, and copy +// the upper 3 packed elements from a to the upper elements of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpgt_ss +FORCE_INLINE __m128 _mm_cmpgt_ss(__m128 a, __m128 b) +{ + return _mm_move_ss(a, _mm_cmpgt_ps(a, b)); +} + +// Compare packed single-precision (32-bit) floating-point elements in a and b +// for less-than-or-equal, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmple_ps +FORCE_INLINE __m128 _mm_cmple_ps(__m128 a, __m128 b) +{ + return vreinterpretq_m128_u32( + vcleq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))); +} + +// Compare the lower single-precision (32-bit) floating-point elements in a and +// b for less-than-or-equal, store the result in the lower element of dst, and +// copy the upper 3 packed elements from a to the upper elements of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmple_ss +FORCE_INLINE __m128 _mm_cmple_ss(__m128 a, __m128 b) +{ + return _mm_move_ss(a, _mm_cmple_ps(a, b)); +} + +// Compare packed single-precision (32-bit) floating-point elements in a and b +// for less-than, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmplt_ps +FORCE_INLINE __m128 _mm_cmplt_ps(__m128 a, __m128 b) +{ + return vreinterpretq_m128_u32( + vcltq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))); +} + +// Compare the lower single-precision (32-bit) floating-point elements in a and +// b for less-than, store the result in the lower element of dst, and copy the +// upper 3 packed elements from a to the upper elements of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmplt_ss +FORCE_INLINE __m128 _mm_cmplt_ss(__m128 a, __m128 b) +{ + return _mm_move_ss(a, _mm_cmplt_ps(a, b)); +} + +// Compare packed single-precision (32-bit) floating-point elements in a and b +// for not-equal, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpneq_ps +FORCE_INLINE __m128 _mm_cmpneq_ps(__m128 a, __m128 b) +{ + return vreinterpretq_m128_u32(vmvnq_u32( + vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)))); +} + +// Compare the lower single-precision (32-bit) floating-point elements in a and +// b for not-equal, store the result in the lower element of dst, and copy the +// upper 3 packed elements from a to the upper elements of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpneq_ss +FORCE_INLINE __m128 _mm_cmpneq_ss(__m128 a, __m128 b) +{ + return _mm_move_ss(a, _mm_cmpneq_ps(a, b)); +} + +// Compare packed single-precision (32-bit) floating-point elements in a and b +// for not-greater-than-or-equal, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnge_ps +FORCE_INLINE __m128 _mm_cmpnge_ps(__m128 a, __m128 b) +{ + return vreinterpretq_m128_u32(vmvnq_u32( + vcgeq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)))); +} + +// Compare the lower single-precision (32-bit) floating-point elements in a and +// b for not-greater-than-or-equal, store the result in the lower element of +// dst, and copy the upper 3 packed elements from a to the upper elements of +// dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnge_ss +FORCE_INLINE __m128 _mm_cmpnge_ss(__m128 a, __m128 b) +{ + return _mm_move_ss(a, _mm_cmpnge_ps(a, b)); +} + +// Compare packed single-precision (32-bit) floating-point elements in a and b +// for not-greater-than, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpngt_ps +FORCE_INLINE __m128 _mm_cmpngt_ps(__m128 a, __m128 b) +{ + return vreinterpretq_m128_u32(vmvnq_u32( + vcgtq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)))); +} + +// Compare the lower single-precision (32-bit) floating-point elements in a and +// b for not-greater-than, store the result in the lower element of dst, and +// copy the upper 3 packed elements from a to the upper elements of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpngt_ss +FORCE_INLINE __m128 _mm_cmpngt_ss(__m128 a, __m128 b) +{ + return _mm_move_ss(a, _mm_cmpngt_ps(a, b)); +} + +// Compare packed single-precision (32-bit) floating-point elements in a and b +// for not-less-than-or-equal, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnle_ps +FORCE_INLINE __m128 _mm_cmpnle_ps(__m128 a, __m128 b) +{ + return vreinterpretq_m128_u32(vmvnq_u32( + vcleq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)))); +} + +// Compare the lower single-precision (32-bit) floating-point elements in a and +// b for not-less-than-or-equal, store the result in the lower element of dst, +// and copy the upper 3 packed elements from a to the upper elements of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnle_ss +FORCE_INLINE __m128 _mm_cmpnle_ss(__m128 a, __m128 b) +{ + return _mm_move_ss(a, _mm_cmpnle_ps(a, b)); +} + +// Compare packed single-precision (32-bit) floating-point elements in a and b +// for not-less-than, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnlt_ps +FORCE_INLINE __m128 _mm_cmpnlt_ps(__m128 a, __m128 b) +{ + return vreinterpretq_m128_u32(vmvnq_u32( + vcltq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)))); +} + +// Compare the lower single-precision (32-bit) floating-point elements in a and +// b for not-less-than, store the result in the lower element of dst, and copy +// the upper 3 packed elements from a to the upper elements of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnlt_ss +FORCE_INLINE __m128 _mm_cmpnlt_ss(__m128 a, __m128 b) +{ + return _mm_move_ss(a, _mm_cmpnlt_ps(a, b)); +} + +// Compare packed single-precision (32-bit) floating-point elements in a and b +// to see if neither is NaN, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpord_ps +// +// See also: +// http://stackoverflow.com/questions/8627331/what-does-ordered-unordered-comparison-mean +// http://stackoverflow.com/questions/29349621/neon-isnanval-intrinsics +FORCE_INLINE __m128 _mm_cmpord_ps(__m128 a, __m128 b) +{ + // Note: NEON does not have ordered compare builtin + // Need to compare a eq a and b eq b to check for NaN + // Do AND of results to get final + uint32x4_t ceqaa = + vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a)); + uint32x4_t ceqbb = + vceqq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b)); + return vreinterpretq_m128_u32(vandq_u32(ceqaa, ceqbb)); +} + +// Compare the lower single-precision (32-bit) floating-point elements in a and +// b to see if neither is NaN, store the result in the lower element of dst, and +// copy the upper 3 packed elements from a to the upper elements of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpord_ss +FORCE_INLINE __m128 _mm_cmpord_ss(__m128 a, __m128 b) +{ + return _mm_move_ss(a, _mm_cmpord_ps(a, b)); +} + +// Compare packed single-precision (32-bit) floating-point elements in a and b +// to see if either is NaN, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpunord_ps +FORCE_INLINE __m128 _mm_cmpunord_ps(__m128 a, __m128 b) +{ + uint32x4_t f32a = + vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a)); + uint32x4_t f32b = + vceqq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b)); + return vreinterpretq_m128_u32(vmvnq_u32(vandq_u32(f32a, f32b))); +} + +// Compare the lower single-precision (32-bit) floating-point elements in a and +// b to see if either is NaN, store the result in the lower element of dst, and +// copy the upper 3 packed elements from a to the upper elements of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpunord_ss +FORCE_INLINE __m128 _mm_cmpunord_ss(__m128 a, __m128 b) +{ + return _mm_move_ss(a, _mm_cmpunord_ps(a, b)); +} + +// Compare the lower single-precision (32-bit) floating-point element in a and b +// for equality, and return the boolean result (0 or 1). +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comieq_ss +FORCE_INLINE int _mm_comieq_ss(__m128 a, __m128 b) +{ + uint32x4_t a_eq_b = + vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)); + return vgetq_lane_u32(a_eq_b, 0) & 0x1; +} + +// Compare the lower single-precision (32-bit) floating-point element in a and b +// for greater-than-or-equal, and return the boolean result (0 or 1). +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comige_ss +FORCE_INLINE int _mm_comige_ss(__m128 a, __m128 b) +{ + uint32x4_t a_ge_b = + vcgeq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)); + return vgetq_lane_u32(a_ge_b, 0) & 0x1; +} + +// Compare the lower single-precision (32-bit) floating-point element in a and b +// for greater-than, and return the boolean result (0 or 1). +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comigt_ss +FORCE_INLINE int _mm_comigt_ss(__m128 a, __m128 b) +{ + uint32x4_t a_gt_b = + vcgtq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)); + return vgetq_lane_u32(a_gt_b, 0) & 0x1; +} + +// Compare the lower single-precision (32-bit) floating-point element in a and b +// for less-than-or-equal, and return the boolean result (0 or 1). +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comile_ss +FORCE_INLINE int _mm_comile_ss(__m128 a, __m128 b) +{ + uint32x4_t a_le_b = + vcleq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)); + return vgetq_lane_u32(a_le_b, 0) & 0x1; +} + +// Compare the lower single-precision (32-bit) floating-point element in a and b +// for less-than, and return the boolean result (0 or 1). +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comilt_ss +FORCE_INLINE int _mm_comilt_ss(__m128 a, __m128 b) +{ + uint32x4_t a_lt_b = + vcltq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)); + return vgetq_lane_u32(a_lt_b, 0) & 0x1; +} + +// Compare the lower single-precision (32-bit) floating-point element in a and b +// for not-equal, and return the boolean result (0 or 1). +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comineq_ss +FORCE_INLINE int _mm_comineq_ss(__m128 a, __m128 b) +{ + return !_mm_comieq_ss(a, b); +} + +// Convert packed signed 32-bit integers in b to packed single-precision +// (32-bit) floating-point elements, store the results in the lower 2 elements +// of dst, and copy the upper 2 packed elements from a to the upper elements of +// dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvt_pi2ps +FORCE_INLINE __m128 _mm_cvt_pi2ps(__m128 a, __m64 b) +{ + return vreinterpretq_m128_f32( + vcombine_f32(vcvt_f32_s32(vreinterpret_s32_m64(b)), + vget_high_f32(vreinterpretq_f32_m128(a)))); +} + +// Convert packed single-precision (32-bit) floating-point elements in a to +// packed 32-bit integers, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvt_ps2pi +FORCE_INLINE __m64 _mm_cvt_ps2pi(__m128 a) +{ +#if (defined(__aarch64__) || defined(_M_ARM64)) || \ + defined(__ARM_FEATURE_DIRECTED_ROUNDING) + return vreinterpret_m64_s32( + vget_low_s32(vcvtnq_s32_f32(vrndiq_f32(vreinterpretq_f32_m128(a))))); +#else + return vreinterpret_m64_s32(vcvt_s32_f32(vget_low_f32( + vreinterpretq_f32_m128(_mm_round_ps(a, _MM_FROUND_CUR_DIRECTION))))); +#endif +} + +// Convert the signed 32-bit integer b to a single-precision (32-bit) +// floating-point element, store the result in the lower element of dst, and +// copy the upper 3 packed elements from a to the upper elements of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvt_si2ss +FORCE_INLINE __m128 _mm_cvt_si2ss(__m128 a, int b) +{ + return vreinterpretq_m128_f32( + vsetq_lane_f32((float) b, vreinterpretq_f32_m128(a), 0)); +} + +// Convert the lower single-precision (32-bit) floating-point element in a to a +// 32-bit integer, and store the result in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvt_ss2si +FORCE_INLINE int _mm_cvt_ss2si(__m128 a) +{ +#if (defined(__aarch64__) || defined(_M_ARM64)) || \ + defined(__ARM_FEATURE_DIRECTED_ROUNDING) + return vgetq_lane_s32(vcvtnq_s32_f32(vrndiq_f32(vreinterpretq_f32_m128(a))), + 0); +#else + float32_t data = vgetq_lane_f32( + vreinterpretq_f32_m128(_mm_round_ps(a, _MM_FROUND_CUR_DIRECTION)), 0); + return (int32_t) data; +#endif +} + +// Convert packed 16-bit integers in a to packed single-precision (32-bit) +// floating-point elements, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtpi16_ps +FORCE_INLINE __m128 _mm_cvtpi16_ps(__m64 a) +{ + return vreinterpretq_m128_f32( + vcvtq_f32_s32(vmovl_s16(vreinterpret_s16_m64(a)))); +} + +// Convert packed 32-bit integers in b to packed single-precision (32-bit) +// floating-point elements, store the results in the lower 2 elements of dst, +// and copy the upper 2 packed elements from a to the upper elements of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtpi32_ps +FORCE_INLINE __m128 _mm_cvtpi32_ps(__m128 a, __m64 b) +{ + return vreinterpretq_m128_f32( + vcombine_f32(vcvt_f32_s32(vreinterpret_s32_m64(b)), + vget_high_f32(vreinterpretq_f32_m128(a)))); +} + +// Convert packed signed 32-bit integers in a to packed single-precision +// (32-bit) floating-point elements, store the results in the lower 2 elements +// of dst, then convert the packed signed 32-bit integers in b to +// single-precision (32-bit) floating-point element, and store the results in +// the upper 2 elements of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtpi32x2_ps +FORCE_INLINE __m128 _mm_cvtpi32x2_ps(__m64 a, __m64 b) +{ + return vreinterpretq_m128_f32(vcvtq_f32_s32( + vcombine_s32(vreinterpret_s32_m64(a), vreinterpret_s32_m64(b)))); +} + +// Convert the lower packed 8-bit integers in a to packed single-precision +// (32-bit) floating-point elements, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtpi8_ps +FORCE_INLINE __m128 _mm_cvtpi8_ps(__m64 a) +{ + return vreinterpretq_m128_f32(vcvtq_f32_s32( + vmovl_s16(vget_low_s16(vmovl_s8(vreinterpret_s8_m64(a)))))); +} + +// Convert packed single-precision (32-bit) floating-point elements in a to +// packed 16-bit integers, and store the results in dst. Note: this intrinsic +// will generate 0x7FFF, rather than 0x8000, for input values between 0x7FFF and +// 0x7FFFFFFF. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtps_pi16 +FORCE_INLINE __m64 _mm_cvtps_pi16(__m128 a) +{ + return vreinterpret_m64_s16( + vqmovn_s32(vreinterpretq_s32_m128i(_mm_cvtps_epi32(a)))); +} + +// Convert packed single-precision (32-bit) floating-point elements in a to +// packed 32-bit integers, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtps_pi32 +#define _mm_cvtps_pi32(a) _mm_cvt_ps2pi(a) + +// Convert packed single-precision (32-bit) floating-point elements in a to +// packed 8-bit integers, and store the results in lower 4 elements of dst. +// Note: this intrinsic will generate 0x7F, rather than 0x80, for input values +// between 0x7F and 0x7FFFFFFF. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtps_pi8 +FORCE_INLINE __m64 _mm_cvtps_pi8(__m128 a) +{ + return vreinterpret_m64_s8(vqmovn_s16( + vcombine_s16(vreinterpret_s16_m64(_mm_cvtps_pi16(a)), vdup_n_s16(0)))); +} + +// Convert packed unsigned 16-bit integers in a to packed single-precision +// (32-bit) floating-point elements, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtpu16_ps +FORCE_INLINE __m128 _mm_cvtpu16_ps(__m64 a) +{ + return vreinterpretq_m128_f32( + vcvtq_f32_u32(vmovl_u16(vreinterpret_u16_m64(a)))); +} + +// Convert the lower packed unsigned 8-bit integers in a to packed +// single-precision (32-bit) floating-point elements, and store the results in +// dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtpu8_ps +FORCE_INLINE __m128 _mm_cvtpu8_ps(__m64 a) +{ + return vreinterpretq_m128_f32(vcvtq_f32_u32( + vmovl_u16(vget_low_u16(vmovl_u8(vreinterpret_u8_m64(a)))))); +} + +// Convert the signed 32-bit integer b to a single-precision (32-bit) +// floating-point element, store the result in the lower element of dst, and +// copy the upper 3 packed elements from a to the upper elements of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi32_ss +#define _mm_cvtsi32_ss(a, b) _mm_cvt_si2ss(a, b) + +// Convert the signed 64-bit integer b to a single-precision (32-bit) +// floating-point element, store the result in the lower element of dst, and +// copy the upper 3 packed elements from a to the upper elements of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi64_ss +FORCE_INLINE __m128 _mm_cvtsi64_ss(__m128 a, int64_t b) +{ + return vreinterpretq_m128_f32( + vsetq_lane_f32((float) b, vreinterpretq_f32_m128(a), 0)); +} + +// Copy the lower single-precision (32-bit) floating-point element of a to dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtss_f32 +FORCE_INLINE float _mm_cvtss_f32(__m128 a) +{ + return vgetq_lane_f32(vreinterpretq_f32_m128(a), 0); +} + +// Convert the lower single-precision (32-bit) floating-point element in a to a +// 32-bit integer, and store the result in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtss_si32 +#define _mm_cvtss_si32(a) _mm_cvt_ss2si(a) + +// Convert the lower single-precision (32-bit) floating-point element in a to a +// 64-bit integer, and store the result in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtss_si64 +FORCE_INLINE int64_t _mm_cvtss_si64(__m128 a) +{ +#if (defined(__aarch64__) || defined(_M_ARM64)) || \ + defined(__ARM_FEATURE_DIRECTED_ROUNDING) + return (int64_t) vgetq_lane_f32(vrndiq_f32(vreinterpretq_f32_m128(a)), 0); +#else + float32_t data = vgetq_lane_f32( + vreinterpretq_f32_m128(_mm_round_ps(a, _MM_FROUND_CUR_DIRECTION)), 0); + return (int64_t) data; +#endif +} + +// Convert packed single-precision (32-bit) floating-point elements in a to +// packed 32-bit integers with truncation, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtt_ps2pi +FORCE_INLINE __m64 _mm_cvtt_ps2pi(__m128 a) +{ + return vreinterpret_m64_s32( + vget_low_s32(vcvtq_s32_f32(vreinterpretq_f32_m128(a)))); +} + +// Convert the lower single-precision (32-bit) floating-point element in a to a +// 32-bit integer with truncation, and store the result in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtt_ss2si +FORCE_INLINE int _mm_cvtt_ss2si(__m128 a) +{ + return vgetq_lane_s32(vcvtq_s32_f32(vreinterpretq_f32_m128(a)), 0); +} + +// Convert packed single-precision (32-bit) floating-point elements in a to +// packed 32-bit integers with truncation, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttps_pi32 +#define _mm_cvttps_pi32(a) _mm_cvtt_ps2pi(a) + +// Convert the lower single-precision (32-bit) floating-point element in a to a +// 32-bit integer with truncation, and store the result in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttss_si32 +#define _mm_cvttss_si32(a) _mm_cvtt_ss2si(a) + +// Convert the lower single-precision (32-bit) floating-point element in a to a +// 64-bit integer with truncation, and store the result in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttss_si64 +FORCE_INLINE int64_t _mm_cvttss_si64(__m128 a) +{ + return (int64_t) vgetq_lane_f32(vreinterpretq_f32_m128(a), 0); +} + +// Divide packed single-precision (32-bit) floating-point elements in a by +// packed elements in b, and store the results in dst. +// Due to ARMv7-A NEON's lack of a precise division intrinsic, we implement +// division by multiplying a by b's reciprocal before using the Newton-Raphson +// method to approximate the results. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_div_ps +FORCE_INLINE __m128 _mm_div_ps(__m128 a, __m128 b) +{ +#if (defined(__aarch64__) || defined(_M_ARM64)) && !SSE2NEON_PRECISE_DIV + return vreinterpretq_m128_f32( + vdivq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))); +#else + float32x4_t recip = vrecpeq_f32(vreinterpretq_f32_m128(b)); + recip = vmulq_f32(recip, vrecpsq_f32(recip, vreinterpretq_f32_m128(b))); + // Additional Netwon-Raphson iteration for accuracy + recip = vmulq_f32(recip, vrecpsq_f32(recip, vreinterpretq_f32_m128(b))); + return vreinterpretq_m128_f32(vmulq_f32(vreinterpretq_f32_m128(a), recip)); +#endif +} + +// Divide the lower single-precision (32-bit) floating-point element in a by the +// lower single-precision (32-bit) floating-point element in b, store the result +// in the lower element of dst, and copy the upper 3 packed elements from a to +// the upper elements of dst. +// Warning: ARMv7-A does not produce the same result compared to Intel and not +// IEEE-compliant. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_div_ss +FORCE_INLINE __m128 _mm_div_ss(__m128 a, __m128 b) +{ + float32_t value = + vgetq_lane_f32(vreinterpretq_f32_m128(_mm_div_ps(a, b)), 0); + return vreinterpretq_m128_f32( + vsetq_lane_f32(value, vreinterpretq_f32_m128(a), 0)); +} + +// Extract a 16-bit integer from a, selected with imm8, and store the result in +// the lower element of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_extract_pi16 +#define _mm_extract_pi16(a, imm) \ + (int32_t) vget_lane_u16(vreinterpret_u16_m64(a), (imm)) + +// Free aligned memory that was allocated with _mm_malloc. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_free +#if !defined(SSE2NEON_ALLOC_DEFINED) +FORCE_INLINE void _mm_free(void *addr) +{ + free(addr); +} +#endif + +FORCE_INLINE uint64_t _sse2neon_get_fpcr(void) +{ + uint64_t value; +#if defined(_MSC_VER) + value = _ReadStatusReg(ARM64_FPCR); +#else + __asm__ __volatile__("mrs %0, FPCR" : "=r"(value)); /* read */ +#endif + return value; +} + +FORCE_INLINE void _sse2neon_set_fpcr(uint64_t value) +{ +#if defined(_MSC_VER) + _WriteStatusReg(ARM64_FPCR, value); +#else + __asm__ __volatile__("msr FPCR, %0" ::"r"(value)); /* write */ +#endif +} + +// Macro: Get the flush zero bits from the MXCSR control and status register. +// The flush zero may contain any of the following flags: _MM_FLUSH_ZERO_ON or +// _MM_FLUSH_ZERO_OFF +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_MM_GET_FLUSH_ZERO_MODE +FORCE_INLINE unsigned int _sse2neon_mm_get_flush_zero_mode(void) +{ + union { + fpcr_bitfield field; +#if defined(__aarch64__) || defined(_M_ARM64) + uint64_t value; +#else + uint32_t value; +#endif + } r; + +#if defined(__aarch64__) || defined(_M_ARM64) + r.value = _sse2neon_get_fpcr(); +#else + __asm__ __volatile__("vmrs %0, FPSCR" : "=r"(r.value)); /* read */ +#endif + + return r.field.bit24 ? _MM_FLUSH_ZERO_ON : _MM_FLUSH_ZERO_OFF; +} + +// Macro: Get the rounding mode bits from the MXCSR control and status register. +// The rounding mode may contain any of the following flags: _MM_ROUND_NEAREST, +// _MM_ROUND_DOWN, _MM_ROUND_UP, _MM_ROUND_TOWARD_ZERO +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_MM_GET_ROUNDING_MODE +FORCE_INLINE unsigned int _MM_GET_ROUNDING_MODE(void) +{ + union { + fpcr_bitfield field; +#if defined(__aarch64__) || defined(_M_ARM64) + uint64_t value; +#else + uint32_t value; +#endif + } r; + +#if defined(__aarch64__) || defined(_M_ARM64) + r.value = _sse2neon_get_fpcr(); +#else + __asm__ __volatile__("vmrs %0, FPSCR" : "=r"(r.value)); /* read */ +#endif + + if (r.field.bit22) { + return r.field.bit23 ? _MM_ROUND_TOWARD_ZERO : _MM_ROUND_UP; + } else { + return r.field.bit23 ? _MM_ROUND_DOWN : _MM_ROUND_NEAREST; + } +} + +// Copy a to dst, and insert the 16-bit integer i into dst at the location +// specified by imm8. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_insert_pi16 +#define _mm_insert_pi16(a, b, imm) \ + vreinterpret_m64_s16(vset_lane_s16((b), vreinterpret_s16_m64(a), (imm))) + +// Load 128-bits (composed of 4 packed single-precision (32-bit) floating-point +// elements) from memory into dst. mem_addr must be aligned on a 16-byte +// boundary or a general-protection exception may be generated. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load_ps +FORCE_INLINE __m128 _mm_load_ps(const float *p) +{ + return vreinterpretq_m128_f32(vld1q_f32(p)); +} + +// Load a single-precision (32-bit) floating-point element from memory into all +// elements of dst. +// +// dst[31:0] := MEM[mem_addr+31:mem_addr] +// dst[63:32] := MEM[mem_addr+31:mem_addr] +// dst[95:64] := MEM[mem_addr+31:mem_addr] +// dst[127:96] := MEM[mem_addr+31:mem_addr] +// +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load_ps1 +#define _mm_load_ps1 _mm_load1_ps + +// Load a single-precision (32-bit) floating-point element from memory into the +// lower of dst, and zero the upper 3 elements. mem_addr does not need to be +// aligned on any particular boundary. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load_ss +FORCE_INLINE __m128 _mm_load_ss(const float *p) +{ + return vreinterpretq_m128_f32(vsetq_lane_f32(*p, vdupq_n_f32(0), 0)); +} + +// Load a single-precision (32-bit) floating-point element from memory into all +// elements of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load1_ps +FORCE_INLINE __m128 _mm_load1_ps(const float *p) +{ + return vreinterpretq_m128_f32(vld1q_dup_f32(p)); +} + +// Load 2 single-precision (32-bit) floating-point elements from memory into the +// upper 2 elements of dst, and copy the lower 2 elements from a to dst. +// mem_addr does not need to be aligned on any particular boundary. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadh_pi +FORCE_INLINE __m128 _mm_loadh_pi(__m128 a, __m64 const *p) +{ + return vreinterpretq_m128_f32( + vcombine_f32(vget_low_f32(a), vld1_f32((const float32_t *) p))); +} + +// Load 2 single-precision (32-bit) floating-point elements from memory into the +// lower 2 elements of dst, and copy the upper 2 elements from a to dst. +// mem_addr does not need to be aligned on any particular boundary. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadl_pi +FORCE_INLINE __m128 _mm_loadl_pi(__m128 a, __m64 const *p) +{ + return vreinterpretq_m128_f32( + vcombine_f32(vld1_f32((const float32_t *) p), vget_high_f32(a))); +} + +// Load 4 single-precision (32-bit) floating-point elements from memory into dst +// in reverse order. mem_addr must be aligned on a 16-byte boundary or a +// general-protection exception may be generated. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadr_ps +FORCE_INLINE __m128 _mm_loadr_ps(const float *p) +{ + float32x4_t v = vrev64q_f32(vld1q_f32(p)); + return vreinterpretq_m128_f32(vextq_f32(v, v, 2)); +} + +// Load 128-bits (composed of 4 packed single-precision (32-bit) floating-point +// elements) from memory into dst. mem_addr does not need to be aligned on any +// particular boundary. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadu_ps +FORCE_INLINE __m128 _mm_loadu_ps(const float *p) +{ + // for neon, alignment doesn't matter, so _mm_load_ps and _mm_loadu_ps are + // equivalent for neon + return vreinterpretq_m128_f32(vld1q_f32(p)); +} + +// Load unaligned 16-bit integer from memory into the first element of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadu_si16 +FORCE_INLINE __m128i _mm_loadu_si16(const void *p) +{ + return vreinterpretq_m128i_s16( + vsetq_lane_s16(*(const int16_t *) p, vdupq_n_s16(0), 0)); +} + +// Load unaligned 64-bit integer from memory into the first element of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadu_si64 +FORCE_INLINE __m128i _mm_loadu_si64(const void *p) +{ + return vreinterpretq_m128i_s64( + vcombine_s64(vld1_s64((const int64_t *) p), vdup_n_s64(0))); +} + +// Allocate size bytes of memory, aligned to the alignment specified in align, +// and return a pointer to the allocated memory. _mm_free should be used to free +// memory that is allocated with _mm_malloc. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_malloc +#if !defined(SSE2NEON_ALLOC_DEFINED) +FORCE_INLINE void *_mm_malloc(size_t size, size_t align) +{ + void *ptr; + if (align == 1) + return malloc(size); + if (align == 2 || (sizeof(void *) == 8 && align == 4)) + align = sizeof(void *); + if (!posix_memalign(&ptr, align, size)) + return ptr; + return NULL; +} +#endif + +// Conditionally store 8-bit integer elements from a into memory using mask +// (elements are not stored when the highest bit is not set in the corresponding +// element) and a non-temporal memory hint. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskmove_si64 +FORCE_INLINE void _mm_maskmove_si64(__m64 a, __m64 mask, char *mem_addr) +{ + int8x8_t shr_mask = vshr_n_s8(vreinterpret_s8_m64(mask), 7); + __m128 b = _mm_load_ps((const float *) mem_addr); + int8x8_t masked = + vbsl_s8(vreinterpret_u8_s8(shr_mask), vreinterpret_s8_m64(a), + vreinterpret_s8_u64(vget_low_u64(vreinterpretq_u64_m128(b)))); + vst1_s8((int8_t *) mem_addr, masked); +} + +// Conditionally store 8-bit integer elements from a into memory using mask +// (elements are not stored when the highest bit is not set in the corresponding +// element) and a non-temporal memory hint. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_m_maskmovq +#define _m_maskmovq(a, mask, mem_addr) _mm_maskmove_si64(a, mask, mem_addr) + +// Compare packed signed 16-bit integers in a and b, and store packed maximum +// values in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_pi16 +FORCE_INLINE __m64 _mm_max_pi16(__m64 a, __m64 b) +{ + return vreinterpret_m64_s16( + vmax_s16(vreinterpret_s16_m64(a), vreinterpret_s16_m64(b))); +} + +// Compare packed single-precision (32-bit) floating-point elements in a and b, +// and store packed maximum values in dst. dst does not follow the IEEE Standard +// for Floating-Point Arithmetic (IEEE 754) maximum value when inputs are NaN or +// signed-zero values. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_ps +FORCE_INLINE __m128 _mm_max_ps(__m128 a, __m128 b) +{ +#if SSE2NEON_PRECISE_MINMAX + float32x4_t _a = vreinterpretq_f32_m128(a); + float32x4_t _b = vreinterpretq_f32_m128(b); + return vreinterpretq_m128_f32(vbslq_f32(vcgtq_f32(_a, _b), _a, _b)); +#else + return vreinterpretq_m128_f32( + vmaxq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))); +#endif +} + +// Compare packed unsigned 8-bit integers in a and b, and store packed maximum +// values in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_pu8 +FORCE_INLINE __m64 _mm_max_pu8(__m64 a, __m64 b) +{ + return vreinterpret_m64_u8( + vmax_u8(vreinterpret_u8_m64(a), vreinterpret_u8_m64(b))); +} + +// Compare the lower single-precision (32-bit) floating-point elements in a and +// b, store the maximum value in the lower element of dst, and copy the upper 3 +// packed elements from a to the upper element of dst. dst does not follow the +// IEEE Standard for Floating-Point Arithmetic (IEEE 754) maximum value when +// inputs are NaN or signed-zero values. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_ss +FORCE_INLINE __m128 _mm_max_ss(__m128 a, __m128 b) +{ + float32_t value = vgetq_lane_f32(_mm_max_ps(a, b), 0); + return vreinterpretq_m128_f32( + vsetq_lane_f32(value, vreinterpretq_f32_m128(a), 0)); +} + +// Compare packed signed 16-bit integers in a and b, and store packed minimum +// values in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_pi16 +FORCE_INLINE __m64 _mm_min_pi16(__m64 a, __m64 b) +{ + return vreinterpret_m64_s16( + vmin_s16(vreinterpret_s16_m64(a), vreinterpret_s16_m64(b))); +} + +// Compare packed single-precision (32-bit) floating-point elements in a and b, +// and store packed minimum values in dst. dst does not follow the IEEE Standard +// for Floating-Point Arithmetic (IEEE 754) minimum value when inputs are NaN or +// signed-zero values. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_ps +FORCE_INLINE __m128 _mm_min_ps(__m128 a, __m128 b) +{ +#if SSE2NEON_PRECISE_MINMAX + float32x4_t _a = vreinterpretq_f32_m128(a); + float32x4_t _b = vreinterpretq_f32_m128(b); + return vreinterpretq_m128_f32(vbslq_f32(vcltq_f32(_a, _b), _a, _b)); +#else + return vreinterpretq_m128_f32( + vminq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))); +#endif +} + +// Compare packed unsigned 8-bit integers in a and b, and store packed minimum +// values in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_pu8 +FORCE_INLINE __m64 _mm_min_pu8(__m64 a, __m64 b) +{ + return vreinterpret_m64_u8( + vmin_u8(vreinterpret_u8_m64(a), vreinterpret_u8_m64(b))); +} + +// Compare the lower single-precision (32-bit) floating-point elements in a and +// b, store the minimum value in the lower element of dst, and copy the upper 3 +// packed elements from a to the upper element of dst. dst does not follow the +// IEEE Standard for Floating-Point Arithmetic (IEEE 754) minimum value when +// inputs are NaN or signed-zero values. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_ss +FORCE_INLINE __m128 _mm_min_ss(__m128 a, __m128 b) +{ + float32_t value = vgetq_lane_f32(_mm_min_ps(a, b), 0); + return vreinterpretq_m128_f32( + vsetq_lane_f32(value, vreinterpretq_f32_m128(a), 0)); +} + +// Move the lower single-precision (32-bit) floating-point element from b to the +// lower element of dst, and copy the upper 3 packed elements from a to the +// upper elements of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_move_ss +FORCE_INLINE __m128 _mm_move_ss(__m128 a, __m128 b) +{ + return vreinterpretq_m128_f32( + vsetq_lane_f32(vgetq_lane_f32(vreinterpretq_f32_m128(b), 0), + vreinterpretq_f32_m128(a), 0)); +} + +// Move the upper 2 single-precision (32-bit) floating-point elements from b to +// the lower 2 elements of dst, and copy the upper 2 elements from a to the +// upper 2 elements of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movehl_ps +FORCE_INLINE __m128 _mm_movehl_ps(__m128 a, __m128 b) +{ +#if defined(aarch64__) + return vreinterpretq_m128_u64( + vzip2q_u64(vreinterpretq_u64_m128(b), vreinterpretq_u64_m128(a))); +#else + float32x2_t a32 = vget_high_f32(vreinterpretq_f32_m128(a)); + float32x2_t b32 = vget_high_f32(vreinterpretq_f32_m128(b)); + return vreinterpretq_m128_f32(vcombine_f32(b32, a32)); +#endif +} + +// Move the lower 2 single-precision (32-bit) floating-point elements from b to +// the upper 2 elements of dst, and copy the lower 2 elements from a to the +// lower 2 elements of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movelh_ps +FORCE_INLINE __m128 _mm_movelh_ps(__m128 __A, __m128 __B) +{ + float32x2_t a10 = vget_low_f32(vreinterpretq_f32_m128(__A)); + float32x2_t b10 = vget_low_f32(vreinterpretq_f32_m128(__B)); + return vreinterpretq_m128_f32(vcombine_f32(a10, b10)); +} + +// Create mask from the most significant bit of each 8-bit element in a, and +// store the result in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movemask_pi8 +FORCE_INLINE int _mm_movemask_pi8(__m64 a) +{ + uint8x8_t input = vreinterpret_u8_m64(a); +#if defined(__aarch64__) || defined(_M_ARM64) + static const int8_t shift[8] = {0, 1, 2, 3, 4, 5, 6, 7}; + uint8x8_t tmp = vshr_n_u8(input, 7); + return vaddv_u8(vshl_u8(tmp, vld1_s8(shift))); +#else + // Refer the implementation of `_mm_movemask_epi8` + uint16x4_t high_bits = vreinterpret_u16_u8(vshr_n_u8(input, 7)); + uint32x2_t paired16 = + vreinterpret_u32_u16(vsra_n_u16(high_bits, high_bits, 7)); + uint8x8_t paired32 = + vreinterpret_u8_u32(vsra_n_u32(paired16, paired16, 14)); + return vget_lane_u8(paired32, 0) | ((int) vget_lane_u8(paired32, 4) << 4); +#endif +} + +// Set each bit of mask dst based on the most significant bit of the +// corresponding packed single-precision (32-bit) floating-point element in a. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movemask_ps +FORCE_INLINE int _mm_movemask_ps(__m128 a) +{ + uint32x4_t input = vreinterpretq_u32_m128(a); +#if defined(__aarch64__) || defined(_M_ARM64) + static const int32_t shift[4] = {0, 1, 2, 3}; + uint32x4_t tmp = vshrq_n_u32(input, 31); + return vaddvq_u32(vshlq_u32(tmp, vld1q_s32(shift))); +#else + // Uses the exact same method as _mm_movemask_epi8, see that for details. + // Shift out everything but the sign bits with a 32-bit unsigned shift + // right. + uint64x2_t high_bits = vreinterpretq_u64_u32(vshrq_n_u32(input, 31)); + // Merge the two pairs together with a 64-bit unsigned shift right + add. + uint8x16_t paired = + vreinterpretq_u8_u64(vsraq_n_u64(high_bits, high_bits, 31)); + // Extract the result. + return vgetq_lane_u8(paired, 0) | (vgetq_lane_u8(paired, 8) << 2); +#endif +} + +// Multiply packed single-precision (32-bit) floating-point elements in a and b, +// and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mul_ps +FORCE_INLINE __m128 _mm_mul_ps(__m128 a, __m128 b) +{ + return vreinterpretq_m128_f32( + vmulq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))); +} + +// Multiply the lower single-precision (32-bit) floating-point element in a and +// b, store the result in the lower element of dst, and copy the upper 3 packed +// elements from a to the upper elements of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mul_ss +FORCE_INLINE __m128 _mm_mul_ss(__m128 a, __m128 b) +{ + return _mm_move_ss(a, _mm_mul_ps(a, b)); +} + +// Multiply the packed unsigned 16-bit integers in a and b, producing +// intermediate 32-bit integers, and store the high 16 bits of the intermediate +// integers in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mulhi_pu16 +FORCE_INLINE __m64 _mm_mulhi_pu16(__m64 a, __m64 b) +{ + return vreinterpret_m64_u16(vshrn_n_u32( + vmull_u16(vreinterpret_u16_m64(a), vreinterpret_u16_m64(b)), 16)); +} + +// Compute the bitwise OR of packed single-precision (32-bit) floating-point +// elements in a and b, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_or_ps +FORCE_INLINE __m128 _mm_or_ps(__m128 a, __m128 b) +{ + return vreinterpretq_m128_s32( + vorrq_s32(vreinterpretq_s32_m128(a), vreinterpretq_s32_m128(b))); +} + +// Average packed unsigned 8-bit integers in a and b, and store the results in +// dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_m_pavgb +#define _m_pavgb(a, b) _mm_avg_pu8(a, b) + +// Average packed unsigned 16-bit integers in a and b, and store the results in +// dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_m_pavgw +#define _m_pavgw(a, b) _mm_avg_pu16(a, b) + +// Extract a 16-bit integer from a, selected with imm8, and store the result in +// the lower element of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_m_pextrw +#define _m_pextrw(a, imm) _mm_extract_pi16(a, imm) + +// Copy a to dst, and insert the 16-bit integer i into dst at the location +// specified by imm8. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=m_pinsrw +#define _m_pinsrw(a, i, imm) _mm_insert_pi16(a, i, imm) + +// Compare packed signed 16-bit integers in a and b, and store packed maximum +// values in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_m_pmaxsw +#define _m_pmaxsw(a, b) _mm_max_pi16(a, b) + +// Compare packed unsigned 8-bit integers in a and b, and store packed maximum +// values in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_m_pmaxub +#define _m_pmaxub(a, b) _mm_max_pu8(a, b) + +// Compare packed signed 16-bit integers in a and b, and store packed minimum +// values in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_m_pminsw +#define _m_pminsw(a, b) _mm_min_pi16(a, b) + +// Compare packed unsigned 8-bit integers in a and b, and store packed minimum +// values in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_m_pminub +#define _m_pminub(a, b) _mm_min_pu8(a, b) + +// Create mask from the most significant bit of each 8-bit element in a, and +// store the result in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_m_pmovmskb +#define _m_pmovmskb(a) _mm_movemask_pi8(a) + +// Multiply the packed unsigned 16-bit integers in a and b, producing +// intermediate 32-bit integers, and store the high 16 bits of the intermediate +// integers in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_m_pmulhuw +#define _m_pmulhuw(a, b) _mm_mulhi_pu16(a, b) + +// Fetch the line of data from memory that contains address p to a location in +// the cache hierarchy specified by the locality hint i. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_prefetch +FORCE_INLINE void _mm_prefetch(char const *p, int i) +{ + (void) i; +#if defined(_MSC_VER) + switch (i) { + case _MM_HINT_NTA: + __prefetch2(p, 1); + break; + case _MM_HINT_T0: + __prefetch2(p, 0); + break; + case _MM_HINT_T1: + __prefetch2(p, 2); + break; + case _MM_HINT_T2: + __prefetch2(p, 4); + break; + } +#else + switch (i) { + case _MM_HINT_NTA: + __builtin_prefetch(p, 0, 0); + break; + case _MM_HINT_T0: + __builtin_prefetch(p, 0, 3); + break; + case _MM_HINT_T1: + __builtin_prefetch(p, 0, 2); + break; + case _MM_HINT_T2: + __builtin_prefetch(p, 0, 1); + break; + } +#endif +} + +// Compute the absolute differences of packed unsigned 8-bit integers in a and +// b, then horizontally sum each consecutive 8 differences to produce four +// unsigned 16-bit integers, and pack these unsigned 16-bit integers in the low +// 16 bits of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=m_psadbw +#define _m_psadbw(a, b) _mm_sad_pu8(a, b) + +// Shuffle 16-bit integers in a using the control in imm8, and store the results +// in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_m_pshufw +#define _m_pshufw(a, imm) _mm_shuffle_pi16(a, imm) + +// Compute the approximate reciprocal of packed single-precision (32-bit) +// floating-point elements in a, and store the results in dst. The maximum +// relative error for this approximation is less than 1.5*2^-12. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_rcp_ps +FORCE_INLINE __m128 _mm_rcp_ps(__m128 in) +{ + float32x4_t recip = vrecpeq_f32(vreinterpretq_f32_m128(in)); + recip = vmulq_f32(recip, vrecpsq_f32(recip, vreinterpretq_f32_m128(in))); +#if SSE2NEON_PRECISE_DIV + // Additional Netwon-Raphson iteration for accuracy + recip = vmulq_f32(recip, vrecpsq_f32(recip, vreinterpretq_f32_m128(in))); +#endif + return vreinterpretq_m128_f32(recip); +} + +// Compute the approximate reciprocal of the lower single-precision (32-bit) +// floating-point element in a, store the result in the lower element of dst, +// and copy the upper 3 packed elements from a to the upper elements of dst. The +// maximum relative error for this approximation is less than 1.5*2^-12. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_rcp_ss +FORCE_INLINE __m128 _mm_rcp_ss(__m128 a) +{ + return _mm_move_ss(a, _mm_rcp_ps(a)); +} + +// Compute the approximate reciprocal square root of packed single-precision +// (32-bit) floating-point elements in a, and store the results in dst. The +// maximum relative error for this approximation is less than 1.5*2^-12. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_rsqrt_ps +FORCE_INLINE __m128 _mm_rsqrt_ps(__m128 in) +{ + float32x4_t out = vrsqrteq_f32(vreinterpretq_f32_m128(in)); + + // Generate masks for detecting whether input has any 0.0f/-0.0f + // (which becomes positive/negative infinity by IEEE-754 arithmetic rules). + const uint32x4_t pos_inf = vdupq_n_u32(0x7F800000); + const uint32x4_t neg_inf = vdupq_n_u32(0xFF800000); + const uint32x4_t has_pos_zero = + vceqq_u32(pos_inf, vreinterpretq_u32_f32(out)); + const uint32x4_t has_neg_zero = + vceqq_u32(neg_inf, vreinterpretq_u32_f32(out)); + + out = vmulq_f32( + out, vrsqrtsq_f32(vmulq_f32(vreinterpretq_f32_m128(in), out), out)); +#if SSE2NEON_PRECISE_SQRT + // Additional Netwon-Raphson iteration for accuracy + out = vmulq_f32( + out, vrsqrtsq_f32(vmulq_f32(vreinterpretq_f32_m128(in), out), out)); +#endif + + // Set output vector element to infinity/negative-infinity if + // the corresponding input vector element is 0.0f/-0.0f. + out = vbslq_f32(has_pos_zero, (float32x4_t) pos_inf, out); + out = vbslq_f32(has_neg_zero, (float32x4_t) neg_inf, out); + + return vreinterpretq_m128_f32(out); +} + +// Compute the approximate reciprocal square root of the lower single-precision +// (32-bit) floating-point element in a, store the result in the lower element +// of dst, and copy the upper 3 packed elements from a to the upper elements of +// dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_rsqrt_ss +FORCE_INLINE __m128 _mm_rsqrt_ss(__m128 in) +{ + return vsetq_lane_f32(vgetq_lane_f32(_mm_rsqrt_ps(in), 0), in, 0); +} + +// Compute the absolute differences of packed unsigned 8-bit integers in a and +// b, then horizontally sum each consecutive 8 differences to produce four +// unsigned 16-bit integers, and pack these unsigned 16-bit integers in the low +// 16 bits of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sad_pu8 +FORCE_INLINE __m64 _mm_sad_pu8(__m64 a, __m64 b) +{ + uint64x1_t t = vpaddl_u32(vpaddl_u16( + vpaddl_u8(vabd_u8(vreinterpret_u8_m64(a), vreinterpret_u8_m64(b))))); + return vreinterpret_m64_u16( + vset_lane_u16((int) vget_lane_u64(t, 0), vdup_n_u16(0), 0)); +} + +// Macro: Set the flush zero bits of the MXCSR control and status register to +// the value in unsigned 32-bit integer a. The flush zero may contain any of the +// following flags: _MM_FLUSH_ZERO_ON or _MM_FLUSH_ZERO_OFF +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_MM_SET_FLUSH_ZERO_MODE +FORCE_INLINE void _sse2neon_mm_set_flush_zero_mode(unsigned int flag) +{ + // AArch32 Advanced SIMD arithmetic always uses the Flush-to-zero setting, + // regardless of the value of the FZ bit. + union { + fpcr_bitfield field; +#if defined(__aarch64__) || defined(_M_ARM64) + uint64_t value; +#else + uint32_t value; +#endif + } r; + +#if defined(__aarch64__) || defined(_M_ARM64) + r.value = _sse2neon_get_fpcr(); +#else + __asm__ __volatile__("vmrs %0, FPSCR" : "=r"(r.value)); /* read */ +#endif + + r.field.bit24 = (flag & _MM_FLUSH_ZERO_MASK) == _MM_FLUSH_ZERO_ON; + +#if defined(__aarch64__) || defined(_M_ARM64) + _sse2neon_set_fpcr(r.value); +#else + __asm__ __volatile__("vmsr FPSCR, %0" ::"r"(r)); /* write */ +#endif +} + +// Set packed single-precision (32-bit) floating-point elements in dst with the +// supplied values. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_ps +FORCE_INLINE __m128 _mm_set_ps(float w, float z, float y, float x) +{ + float ALIGN_STRUCT(16) data[4] = {x, y, z, w}; + return vreinterpretq_m128_f32(vld1q_f32(data)); +} + +// Broadcast single-precision (32-bit) floating-point value a to all elements of +// dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_ps1 +FORCE_INLINE __m128 _mm_set_ps1(float _w) +{ + return vreinterpretq_m128_f32(vdupq_n_f32(_w)); +} + +// Macro: Set the rounding mode bits of the MXCSR control and status register to +// the value in unsigned 32-bit integer a. The rounding mode may contain any of +// the following flags: _MM_ROUND_NEAREST, _MM_ROUND_DOWN, _MM_ROUND_UP, +// _MM_ROUND_TOWARD_ZERO +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_MM_SET_ROUNDING_MODE +FORCE_INLINE void _MM_SET_ROUNDING_MODE(int rounding) +{ + union { + fpcr_bitfield field; +#if defined(__aarch64__) || defined(_M_ARM64) + uint64_t value; +#else + uint32_t value; +#endif + } r; + +#if defined(__aarch64__) || defined(_M_ARM64) + r.value = _sse2neon_get_fpcr(); +#else + __asm__ __volatile__("vmrs %0, FPSCR" : "=r"(r.value)); /* read */ +#endif + + switch (rounding) { + case _MM_ROUND_TOWARD_ZERO: + r.field.bit22 = 1; + r.field.bit23 = 1; + break; + case _MM_ROUND_DOWN: + r.field.bit22 = 0; + r.field.bit23 = 1; + break; + case _MM_ROUND_UP: + r.field.bit22 = 1; + r.field.bit23 = 0; + break; + default: //_MM_ROUND_NEAREST + r.field.bit22 = 0; + r.field.bit23 = 0; + } + +#if defined(__aarch64__) || defined(_M_ARM64) + _sse2neon_set_fpcr(r.value); +#else + __asm__ __volatile__("vmsr FPSCR, %0" ::"r"(r)); /* write */ +#endif +} + +// Copy single-precision (32-bit) floating-point element a to the lower element +// of dst, and zero the upper 3 elements. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_ss +FORCE_INLINE __m128 _mm_set_ss(float a) +{ + return vreinterpretq_m128_f32(vsetq_lane_f32(a, vdupq_n_f32(0), 0)); +} + +// Broadcast single-precision (32-bit) floating-point value a to all elements of +// dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set1_ps +FORCE_INLINE __m128 _mm_set1_ps(float _w) +{ + return vreinterpretq_m128_f32(vdupq_n_f32(_w)); +} + +// Set the MXCSR control and status register with the value in unsigned 32-bit +// integer a. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setcsr +// FIXME: _mm_setcsr() implementation supports changing the rounding mode only. +FORCE_INLINE void _mm_setcsr(unsigned int a) +{ + _MM_SET_ROUNDING_MODE(a); +} + +// Get the unsigned 32-bit value of the MXCSR control and status register. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_getcsr +// FIXME: _mm_getcsr() implementation supports reading the rounding mode only. +FORCE_INLINE unsigned int _mm_getcsr(void) +{ + return _MM_GET_ROUNDING_MODE(); +} + +// Set packed single-precision (32-bit) floating-point elements in dst with the +// supplied values in reverse order. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setr_ps +FORCE_INLINE __m128 _mm_setr_ps(float w, float z, float y, float x) +{ + float ALIGN_STRUCT(16) data[4] = {w, z, y, x}; + return vreinterpretq_m128_f32(vld1q_f32(data)); +} + +// Return vector of type __m128 with all elements set to zero. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setzero_ps +FORCE_INLINE __m128 _mm_setzero_ps(void) +{ + return vreinterpretq_m128_f32(vdupq_n_f32(0)); +} + +// Shuffle 16-bit integers in a using the control in imm8, and store the results +// in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shuffle_pi16 +#ifdef _sse2neon_shuffle +#define _mm_shuffle_pi16(a, imm) \ + vreinterpret_m64_s16(vshuffle_s16( \ + vreinterpret_s16_m64(a), vreinterpret_s16_m64(a), (imm & 0x3), \ + ((imm >> 2) & 0x3), ((imm >> 4) & 0x3), ((imm >> 6) & 0x3))) +#else +#define _mm_shuffle_pi16(a, imm) \ + _sse2neon_define1( \ + __m64, a, int16x4_t ret; \ + ret = vmov_n_s16( \ + vget_lane_s16(vreinterpret_s16_m64(_a), (imm) & (0x3))); \ + ret = vset_lane_s16( \ + vget_lane_s16(vreinterpret_s16_m64(_a), ((imm) >> 2) & 0x3), ret, \ + 1); \ + ret = vset_lane_s16( \ + vget_lane_s16(vreinterpret_s16_m64(_a), ((imm) >> 4) & 0x3), ret, \ + 2); \ + ret = vset_lane_s16( \ + vget_lane_s16(vreinterpret_s16_m64(_a), ((imm) >> 6) & 0x3), ret, \ + 3); \ + _sse2neon_return(vreinterpret_m64_s16(ret));) +#endif + +// Perform a serializing operation on all store-to-memory instructions that were +// issued prior to this instruction. Guarantees that every store instruction +// that precedes, in program order, is globally visible before any store +// instruction which follows the fence in program order. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sfence +FORCE_INLINE void _mm_sfence(void) +{ + _sse2neon_smp_mb(); +} + +// Perform a serializing operation on all load-from-memory and store-to-memory +// instructions that were issued prior to this instruction. Guarantees that +// every memory access that precedes, in program order, the memory fence +// instruction is globally visible before any memory instruction which follows +// the fence in program order. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mfence +FORCE_INLINE void _mm_mfence(void) +{ + _sse2neon_smp_mb(); +} + +// Perform a serializing operation on all load-from-memory instructions that +// were issued prior to this instruction. Guarantees that every load instruction +// that precedes, in program order, is globally visible before any load +// instruction which follows the fence in program order. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_lfence +FORCE_INLINE void _mm_lfence(void) +{ + _sse2neon_smp_mb(); +} + +// FORCE_INLINE __m128 _mm_shuffle_ps(__m128 a, __m128 b, __constrange(0,255) +// int imm) +#ifdef _sse2neon_shuffle +#define _mm_shuffle_ps(a, b, imm) \ + __extension__({ \ + float32x4_t _input1 = vreinterpretq_f32_m128(a); \ + float32x4_t _input2 = vreinterpretq_f32_m128(b); \ + float32x4_t _shuf = \ + vshuffleq_s32(_input1, _input2, (imm) & (0x3), ((imm) >> 2) & 0x3, \ + (((imm) >> 4) & 0x3) + 4, (((imm) >> 6) & 0x3) + 4); \ + vreinterpretq_m128_f32(_shuf); \ + }) +#else // generic +#define _mm_shuffle_ps(a, b, imm) \ + _sse2neon_define2( \ + __m128, a, b, __m128 ret; switch (imm) { \ + case _MM_SHUFFLE(1, 0, 3, 2): \ + ret = _mm_shuffle_ps_1032(_a, _b); \ + break; \ + case _MM_SHUFFLE(2, 3, 0, 1): \ + ret = _mm_shuffle_ps_2301(_a, _b); \ + break; \ + case _MM_SHUFFLE(0, 3, 2, 1): \ + ret = _mm_shuffle_ps_0321(_a, _b); \ + break; \ + case _MM_SHUFFLE(2, 1, 0, 3): \ + ret = _mm_shuffle_ps_2103(_a, _b); \ + break; \ + case _MM_SHUFFLE(1, 0, 1, 0): \ + ret = _mm_movelh_ps(_a, _b); \ + break; \ + case _MM_SHUFFLE(1, 0, 0, 1): \ + ret = _mm_shuffle_ps_1001(_a, _b); \ + break; \ + case _MM_SHUFFLE(0, 1, 0, 1): \ + ret = _mm_shuffle_ps_0101(_a, _b); \ + break; \ + case _MM_SHUFFLE(3, 2, 1, 0): \ + ret = _mm_shuffle_ps_3210(_a, _b); \ + break; \ + case _MM_SHUFFLE(0, 0, 1, 1): \ + ret = _mm_shuffle_ps_0011(_a, _b); \ + break; \ + case _MM_SHUFFLE(0, 0, 2, 2): \ + ret = _mm_shuffle_ps_0022(_a, _b); \ + break; \ + case _MM_SHUFFLE(2, 2, 0, 0): \ + ret = _mm_shuffle_ps_2200(_a, _b); \ + break; \ + case _MM_SHUFFLE(3, 2, 0, 2): \ + ret = _mm_shuffle_ps_3202(_a, _b); \ + break; \ + case _MM_SHUFFLE(3, 2, 3, 2): \ + ret = _mm_movehl_ps(_b, _a); \ + break; \ + case _MM_SHUFFLE(1, 1, 3, 3): \ + ret = _mm_shuffle_ps_1133(_a, _b); \ + break; \ + case _MM_SHUFFLE(2, 0, 1, 0): \ + ret = _mm_shuffle_ps_2010(_a, _b); \ + break; \ + case _MM_SHUFFLE(2, 0, 0, 1): \ + ret = _mm_shuffle_ps_2001(_a, _b); \ + break; \ + case _MM_SHUFFLE(2, 0, 3, 2): \ + ret = _mm_shuffle_ps_2032(_a, _b); \ + break; \ + default: \ + ret = _mm_shuffle_ps_default(_a, _b, (imm)); \ + break; \ + } _sse2neon_return(ret);) +#endif + +// Compute the square root of packed single-precision (32-bit) floating-point +// elements in a, and store the results in dst. +// Due to ARMv7-A NEON's lack of a precise square root intrinsic, we implement +// square root by multiplying input in with its reciprocal square root before +// using the Newton-Raphson method to approximate the results. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sqrt_ps +FORCE_INLINE __m128 _mm_sqrt_ps(__m128 in) +{ +#if (defined(__aarch64__) || defined(_M_ARM64)) && !SSE2NEON_PRECISE_SQRT + return vreinterpretq_m128_f32(vsqrtq_f32(vreinterpretq_f32_m128(in))); +#else + float32x4_t recip = vrsqrteq_f32(vreinterpretq_f32_m128(in)); + + // Test for vrsqrteq_f32(0) -> positive infinity case. + // Change to zero, so that s * 1/sqrt(s) result is zero too. + const uint32x4_t pos_inf = vdupq_n_u32(0x7F800000); + const uint32x4_t div_by_zero = + vceqq_u32(pos_inf, vreinterpretq_u32_f32(recip)); + recip = vreinterpretq_f32_u32( + vandq_u32(vmvnq_u32(div_by_zero), vreinterpretq_u32_f32(recip))); + + recip = vmulq_f32( + vrsqrtsq_f32(vmulq_f32(recip, recip), vreinterpretq_f32_m128(in)), + recip); + // Additional Netwon-Raphson iteration for accuracy + recip = vmulq_f32( + vrsqrtsq_f32(vmulq_f32(recip, recip), vreinterpretq_f32_m128(in)), + recip); + + // sqrt(s) = s * 1/sqrt(s) + return vreinterpretq_m128_f32(vmulq_f32(vreinterpretq_f32_m128(in), recip)); +#endif +} + +// Compute the square root of the lower single-precision (32-bit) floating-point +// element in a, store the result in the lower element of dst, and copy the +// upper 3 packed elements from a to the upper elements of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sqrt_ss +FORCE_INLINE __m128 _mm_sqrt_ss(__m128 in) +{ + float32_t value = + vgetq_lane_f32(vreinterpretq_f32_m128(_mm_sqrt_ps(in)), 0); + return vreinterpretq_m128_f32( + vsetq_lane_f32(value, vreinterpretq_f32_m128(in), 0)); +} + +// Store 128-bits (composed of 4 packed single-precision (32-bit) floating-point +// elements) from a into memory. mem_addr must be aligned on a 16-byte boundary +// or a general-protection exception may be generated. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_store_ps +FORCE_INLINE void _mm_store_ps(float *p, __m128 a) +{ + vst1q_f32(p, vreinterpretq_f32_m128(a)); +} + +// Store the lower single-precision (32-bit) floating-point element from a into +// 4 contiguous elements in memory. mem_addr must be aligned on a 16-byte +// boundary or a general-protection exception may be generated. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_store_ps1 +FORCE_INLINE void _mm_store_ps1(float *p, __m128 a) +{ + float32_t a0 = vgetq_lane_f32(vreinterpretq_f32_m128(a), 0); + vst1q_f32(p, vdupq_n_f32(a0)); +} + +// Store the lower single-precision (32-bit) floating-point element from a into +// memory. mem_addr does not need to be aligned on any particular boundary. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_store_ss +FORCE_INLINE void _mm_store_ss(float *p, __m128 a) +{ + vst1q_lane_f32(p, vreinterpretq_f32_m128(a), 0); +} + +// Store the lower single-precision (32-bit) floating-point element from a into +// 4 contiguous elements in memory. mem_addr must be aligned on a 16-byte +// boundary or a general-protection exception may be generated. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_store1_ps +#define _mm_store1_ps _mm_store_ps1 + +// Store the upper 2 single-precision (32-bit) floating-point elements from a +// into memory. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storeh_pi +FORCE_INLINE void _mm_storeh_pi(__m64 *p, __m128 a) +{ + *p = vreinterpret_m64_f32(vget_high_f32(a)); +} + +// Store the lower 2 single-precision (32-bit) floating-point elements from a +// into memory. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storel_pi +FORCE_INLINE void _mm_storel_pi(__m64 *p, __m128 a) +{ + *p = vreinterpret_m64_f32(vget_low_f32(a)); +} + +// Store 4 single-precision (32-bit) floating-point elements from a into memory +// in reverse order. mem_addr must be aligned on a 16-byte boundary or a +// general-protection exception may be generated. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storer_ps +FORCE_INLINE void _mm_storer_ps(float *p, __m128 a) +{ + float32x4_t tmp = vrev64q_f32(vreinterpretq_f32_m128(a)); + float32x4_t rev = vextq_f32(tmp, tmp, 2); + vst1q_f32(p, rev); +} + +// Store 128-bits (composed of 4 packed single-precision (32-bit) floating-point +// elements) from a into memory. mem_addr does not need to be aligned on any +// particular boundary. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storeu_ps +FORCE_INLINE void _mm_storeu_ps(float *p, __m128 a) +{ + vst1q_f32(p, vreinterpretq_f32_m128(a)); +} + +// Stores 16-bits of integer data a at the address p. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storeu_si16 +FORCE_INLINE void _mm_storeu_si16(void *p, __m128i a) +{ + vst1q_lane_s16((int16_t *) p, vreinterpretq_s16_m128i(a), 0); +} + +// Stores 64-bits of integer data a at the address p. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storeu_si64 +FORCE_INLINE void _mm_storeu_si64(void *p, __m128i a) +{ + vst1q_lane_s64((int64_t *) p, vreinterpretq_s64_m128i(a), 0); +} + +// Store 64-bits of integer data from a into memory using a non-temporal memory +// hint. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_stream_pi +FORCE_INLINE void _mm_stream_pi(__m64 *p, __m64 a) +{ + vst1_s64((int64_t *) p, vreinterpret_s64_m64(a)); +} + +// Store 128-bits (composed of 4 packed single-precision (32-bit) floating- +// point elements) from a into memory using a non-temporal memory hint. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_stream_ps +FORCE_INLINE void _mm_stream_ps(float *p, __m128 a) +{ +#if __has_builtin(__builtin_nontemporal_store) + __builtin_nontemporal_store(a, (float32x4_t *) p); +#else + vst1q_f32(p, vreinterpretq_f32_m128(a)); +#endif +} + +// Subtract packed single-precision (32-bit) floating-point elements in b from +// packed single-precision (32-bit) floating-point elements in a, and store the +// results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sub_ps +FORCE_INLINE __m128 _mm_sub_ps(__m128 a, __m128 b) +{ + return vreinterpretq_m128_f32( + vsubq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))); +} + +// Subtract the lower single-precision (32-bit) floating-point element in b from +// the lower single-precision (32-bit) floating-point element in a, store the +// result in the lower element of dst, and copy the upper 3 packed elements from +// a to the upper elements of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sub_ss +FORCE_INLINE __m128 _mm_sub_ss(__m128 a, __m128 b) +{ + return _mm_move_ss(a, _mm_sub_ps(a, b)); +} + +// Macro: Transpose the 4x4 matrix formed by the 4 rows of single-precision +// (32-bit) floating-point elements in row0, row1, row2, and row3, and store the +// transposed matrix in these vectors (row0 now contains column 0, etc.). +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=MM_TRANSPOSE4_PS +#define _MM_TRANSPOSE4_PS(row0, row1, row2, row3) \ + do { \ + float32x4x2_t ROW01 = vtrnq_f32(row0, row1); \ + float32x4x2_t ROW23 = vtrnq_f32(row2, row3); \ + row0 = vcombine_f32(vget_low_f32(ROW01.val[0]), \ + vget_low_f32(ROW23.val[0])); \ + row1 = vcombine_f32(vget_low_f32(ROW01.val[1]), \ + vget_low_f32(ROW23.val[1])); \ + row2 = vcombine_f32(vget_high_f32(ROW01.val[0]), \ + vget_high_f32(ROW23.val[0])); \ + row3 = vcombine_f32(vget_high_f32(ROW01.val[1]), \ + vget_high_f32(ROW23.val[1])); \ + } while (0) + +// according to the documentation, these intrinsics behave the same as the +// non-'u' versions. We'll just alias them here. +#define _mm_ucomieq_ss _mm_comieq_ss +#define _mm_ucomige_ss _mm_comige_ss +#define _mm_ucomigt_ss _mm_comigt_ss +#define _mm_ucomile_ss _mm_comile_ss +#define _mm_ucomilt_ss _mm_comilt_ss +#define _mm_ucomineq_ss _mm_comineq_ss + +// Return vector of type __m128i with undefined elements. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_undefined_si128 +FORCE_INLINE __m128i _mm_undefined_si128(void) +{ +#if defined(__GNUC__) || defined(__clang__) +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wuninitialized" +#endif + __m128i a; +#if defined(_MSC_VER) + a = _mm_setzero_si128(); +#endif + return a; +#if defined(__GNUC__) || defined(__clang__) +#pragma GCC diagnostic pop +#endif +} + +// Return vector of type __m128 with undefined elements. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_undefined_ps +FORCE_INLINE __m128 _mm_undefined_ps(void) +{ +#if defined(__GNUC__) || defined(__clang__) +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wuninitialized" +#endif + __m128 a; +#if defined(_MSC_VER) + a = _mm_setzero_ps(); +#endif + return a; +#if defined(__GNUC__) || defined(__clang__) +#pragma GCC diagnostic pop +#endif +} + +// Unpack and interleave single-precision (32-bit) floating-point elements from +// the high half a and b, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpackhi_ps +FORCE_INLINE __m128 _mm_unpackhi_ps(__m128 a, __m128 b) +{ +#if defined(__aarch64__) || defined(_M_ARM64) + return vreinterpretq_m128_f32( + vzip2q_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))); +#else + float32x2_t a1 = vget_high_f32(vreinterpretq_f32_m128(a)); + float32x2_t b1 = vget_high_f32(vreinterpretq_f32_m128(b)); + float32x2x2_t result = vzip_f32(a1, b1); + return vreinterpretq_m128_f32(vcombine_f32(result.val[0], result.val[1])); +#endif +} + +// Unpack and interleave single-precision (32-bit) floating-point elements from +// the low half of a and b, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpacklo_ps +FORCE_INLINE __m128 _mm_unpacklo_ps(__m128 a, __m128 b) +{ +#if defined(__aarch64__) || defined(_M_ARM64) + return vreinterpretq_m128_f32( + vzip1q_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))); +#else + float32x2_t a1 = vget_low_f32(vreinterpretq_f32_m128(a)); + float32x2_t b1 = vget_low_f32(vreinterpretq_f32_m128(b)); + float32x2x2_t result = vzip_f32(a1, b1); + return vreinterpretq_m128_f32(vcombine_f32(result.val[0], result.val[1])); +#endif +} + +// Compute the bitwise XOR of packed single-precision (32-bit) floating-point +// elements in a and b, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_xor_ps +FORCE_INLINE __m128 _mm_xor_ps(__m128 a, __m128 b) +{ + return vreinterpretq_m128_s32( + veorq_s32(vreinterpretq_s32_m128(a), vreinterpretq_s32_m128(b))); +} + +/* SSE2 */ + +// Add packed 16-bit integers in a and b, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_epi16 +FORCE_INLINE __m128i _mm_add_epi16(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_s16( + vaddq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b))); +} + +// Add packed 32-bit integers in a and b, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_epi32 +FORCE_INLINE __m128i _mm_add_epi32(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_s32( + vaddq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b))); +} + +// Add packed 64-bit integers in a and b, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_epi64 +FORCE_INLINE __m128i _mm_add_epi64(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_s64( + vaddq_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(b))); +} + +// Add packed 8-bit integers in a and b, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_epi8 +FORCE_INLINE __m128i _mm_add_epi8(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_s8( + vaddq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b))); +} + +// Add packed double-precision (64-bit) floating-point elements in a and b, and +// store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_pd +FORCE_INLINE __m128d _mm_add_pd(__m128d a, __m128d b) +{ +#if defined(__aarch64__) || defined(_M_ARM64) + return vreinterpretq_m128d_f64( + vaddq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b))); +#else + double *da = (double *) &a; + double *db = (double *) &b; + double c[2]; + c[0] = da[0] + db[0]; + c[1] = da[1] + db[1]; + return vld1q_f32((float32_t *) c); +#endif +} + +// Add the lower double-precision (64-bit) floating-point element in a and b, +// store the result in the lower element of dst, and copy the upper element from +// a to the upper element of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_sd +FORCE_INLINE __m128d _mm_add_sd(__m128d a, __m128d b) +{ +#if defined(__aarch64__) || defined(_M_ARM64) + return _mm_move_sd(a, _mm_add_pd(a, b)); +#else + double *da = (double *) &a; + double *db = (double *) &b; + double c[2]; + c[0] = da[0] + db[0]; + c[1] = da[1]; + return vld1q_f32((float32_t *) c); +#endif +} + +// Add 64-bit integers a and b, and store the result in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_si64 +FORCE_INLINE __m64 _mm_add_si64(__m64 a, __m64 b) +{ + return vreinterpret_m64_s64( + vadd_s64(vreinterpret_s64_m64(a), vreinterpret_s64_m64(b))); +} + +// Add packed signed 16-bit integers in a and b using saturation, and store the +// results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_adds_epi16 +FORCE_INLINE __m128i _mm_adds_epi16(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_s16( + vqaddq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b))); +} + +// Add packed signed 8-bit integers in a and b using saturation, and store the +// results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_adds_epi8 +FORCE_INLINE __m128i _mm_adds_epi8(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_s8( + vqaddq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b))); +} + +// Add packed unsigned 16-bit integers in a and b using saturation, and store +// the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_adds_epu16 +FORCE_INLINE __m128i _mm_adds_epu16(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_u16( + vqaddq_u16(vreinterpretq_u16_m128i(a), vreinterpretq_u16_m128i(b))); +} + +// Add packed unsigned 8-bit integers in a and b using saturation, and store the +// results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_adds_epu8 +FORCE_INLINE __m128i _mm_adds_epu8(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_u8( + vqaddq_u8(vreinterpretq_u8_m128i(a), vreinterpretq_u8_m128i(b))); +} + +// Compute the bitwise AND of packed double-precision (64-bit) floating-point +// elements in a and b, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_and_pd +FORCE_INLINE __m128d _mm_and_pd(__m128d a, __m128d b) +{ + return vreinterpretq_m128d_s64( + vandq_s64(vreinterpretq_s64_m128d(a), vreinterpretq_s64_m128d(b))); +} + +// Compute the bitwise AND of 128 bits (representing integer data) in a and b, +// and store the result in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_and_si128 +FORCE_INLINE __m128i _mm_and_si128(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_s32( + vandq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b))); +} + +// Compute the bitwise NOT of packed double-precision (64-bit) floating-point +// elements in a and then AND with b, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_andnot_pd +FORCE_INLINE __m128d _mm_andnot_pd(__m128d a, __m128d b) +{ + // *NOTE* argument swap + return vreinterpretq_m128d_s64( + vbicq_s64(vreinterpretq_s64_m128d(b), vreinterpretq_s64_m128d(a))); +} + +// Compute the bitwise NOT of 128 bits (representing integer data) in a and then +// AND with b, and store the result in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_andnot_si128 +FORCE_INLINE __m128i _mm_andnot_si128(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_s32( + vbicq_s32(vreinterpretq_s32_m128i(b), + vreinterpretq_s32_m128i(a))); // *NOTE* argument swap +} + +// Average packed unsigned 16-bit integers in a and b, and store the results in +// dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_avg_epu16 +FORCE_INLINE __m128i _mm_avg_epu16(__m128i a, __m128i b) +{ + return (__m128i) vrhaddq_u16(vreinterpretq_u16_m128i(a), + vreinterpretq_u16_m128i(b)); +} + +// Average packed unsigned 8-bit integers in a and b, and store the results in +// dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_avg_epu8 +FORCE_INLINE __m128i _mm_avg_epu8(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_u8( + vrhaddq_u8(vreinterpretq_u8_m128i(a), vreinterpretq_u8_m128i(b))); +} + +// Shift a left by imm8 bytes while shifting in zeros, and store the results in +// dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_bslli_si128 +#define _mm_bslli_si128(a, imm) _mm_slli_si128(a, imm) + +// Shift a right by imm8 bytes while shifting in zeros, and store the results in +// dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_bsrli_si128 +#define _mm_bsrli_si128(a, imm) _mm_srli_si128(a, imm) + +// Cast vector of type __m128d to type __m128. This intrinsic is only used for +// compilation and does not generate any instructions, thus it has zero latency. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_castpd_ps +FORCE_INLINE __m128 _mm_castpd_ps(__m128d a) +{ + return vreinterpretq_m128_s64(vreinterpretq_s64_m128d(a)); +} + +// Cast vector of type __m128d to type __m128i. This intrinsic is only used for +// compilation and does not generate any instructions, thus it has zero latency. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_castpd_si128 +FORCE_INLINE __m128i _mm_castpd_si128(__m128d a) +{ + return vreinterpretq_m128i_s64(vreinterpretq_s64_m128d(a)); +} + +// Cast vector of type __m128 to type __m128d. This intrinsic is only used for +// compilation and does not generate any instructions, thus it has zero latency. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_castps_pd +FORCE_INLINE __m128d _mm_castps_pd(__m128 a) +{ + return vreinterpretq_m128d_s32(vreinterpretq_s32_m128(a)); +} + +// Cast vector of type __m128 to type __m128i. This intrinsic is only used for +// compilation and does not generate any instructions, thus it has zero latency. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_castps_si128 +FORCE_INLINE __m128i _mm_castps_si128(__m128 a) +{ + return vreinterpretq_m128i_s32(vreinterpretq_s32_m128(a)); +} + +// Cast vector of type __m128i to type __m128d. This intrinsic is only used for +// compilation and does not generate any instructions, thus it has zero latency. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_castsi128_pd +FORCE_INLINE __m128d _mm_castsi128_pd(__m128i a) +{ +#if defined(__aarch64__) || defined(_M_ARM64) + return vreinterpretq_m128d_f64(vreinterpretq_f64_m128i(a)); +#else + return vreinterpretq_m128d_f32(vreinterpretq_f32_m128i(a)); +#endif +} + +// Cast vector of type __m128i to type __m128. This intrinsic is only used for +// compilation and does not generate any instructions, thus it has zero latency. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_castsi128_ps +FORCE_INLINE __m128 _mm_castsi128_ps(__m128i a) +{ + return vreinterpretq_m128_s32(vreinterpretq_s32_m128i(a)); +} + +// Invalidate and flush the cache line that contains p from all levels of the +// cache hierarchy. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_clflush +#if defined(__APPLE__) +#include +#endif +FORCE_INLINE void _mm_clflush(void const *p) +{ + (void) p; + + /* sys_icache_invalidate is supported since macOS 10.5. + * However, it does not work on non-jailbroken iOS devices, although the + * compilation is successful. + */ +#if defined(__APPLE__) + sys_icache_invalidate((void *) (uintptr_t) p, SSE2NEON_CACHELINE_SIZE); +#elif defined(__GNUC__) || defined(__clang__) + uintptr_t ptr = (uintptr_t) p; + __builtin___clear_cache((char *) ptr, + (char *) ptr + SSE2NEON_CACHELINE_SIZE); +#elif (_MSC_VER) && SSE2NEON_INCLUDE_WINDOWS_H + FlushInstructionCache(GetCurrentProcess(), p, SSE2NEON_CACHELINE_SIZE); +#endif +} + +// Compare packed 16-bit integers in a and b for equality, and store the results +// in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpeq_epi16 +FORCE_INLINE __m128i _mm_cmpeq_epi16(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_u16( + vceqq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b))); +} + +// Compare packed 32-bit integers in a and b for equality, and store the results +// in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpeq_epi32 +FORCE_INLINE __m128i _mm_cmpeq_epi32(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_u32( + vceqq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b))); +} + +// Compare packed 8-bit integers in a and b for equality, and store the results +// in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpeq_epi8 +FORCE_INLINE __m128i _mm_cmpeq_epi8(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_u8( + vceqq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b))); +} + +// Compare packed double-precision (64-bit) floating-point elements in a and b +// for equality, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpeq_pd +FORCE_INLINE __m128d _mm_cmpeq_pd(__m128d a, __m128d b) +{ +#if defined(__aarch64__) || defined(_M_ARM64) + return vreinterpretq_m128d_u64( + vceqq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b))); +#else + // (a == b) -> (a_lo == b_lo) && (a_hi == b_hi) + uint32x4_t cmp = + vceqq_u32(vreinterpretq_u32_m128d(a), vreinterpretq_u32_m128d(b)); + uint32x4_t swapped = vrev64q_u32(cmp); + return vreinterpretq_m128d_u32(vandq_u32(cmp, swapped)); +#endif +} + +// Compare the lower double-precision (64-bit) floating-point elements in a and +// b for equality, store the result in the lower element of dst, and copy the +// upper element from a to the upper element of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpeq_sd +FORCE_INLINE __m128d _mm_cmpeq_sd(__m128d a, __m128d b) +{ + return _mm_move_sd(a, _mm_cmpeq_pd(a, b)); +} + +// Compare packed double-precision (64-bit) floating-point elements in a and b +// for greater-than-or-equal, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpge_pd +FORCE_INLINE __m128d _mm_cmpge_pd(__m128d a, __m128d b) +{ +#if defined(__aarch64__) || defined(_M_ARM64) + return vreinterpretq_m128d_u64( + vcgeq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b))); +#else + uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a)); + uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a)); + uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b)); + uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b)); + uint64_t d[2]; + d[0] = (*(double *) &a0) >= (*(double *) &b0) ? ~UINT64_C(0) : UINT64_C(0); + d[1] = (*(double *) &a1) >= (*(double *) &b1) ? ~UINT64_C(0) : UINT64_C(0); + + return vreinterpretq_m128d_u64(vld1q_u64(d)); +#endif +} + +// Compare the lower double-precision (64-bit) floating-point elements in a and +// b for greater-than-or-equal, store the result in the lower element of dst, +// and copy the upper element from a to the upper element of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpge_sd +FORCE_INLINE __m128d _mm_cmpge_sd(__m128d a, __m128d b) +{ +#if defined(__aarch64__) || defined(_M_ARM64) + return _mm_move_sd(a, _mm_cmpge_pd(a, b)); +#else + // expand "_mm_cmpge_pd()" to reduce unnecessary operations + uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a)); + uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a)); + uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b)); + uint64_t d[2]; + d[0] = (*(double *) &a0) >= (*(double *) &b0) ? ~UINT64_C(0) : UINT64_C(0); + d[1] = a1; + + return vreinterpretq_m128d_u64(vld1q_u64(d)); +#endif +} + +// Compare packed signed 16-bit integers in a and b for greater-than, and store +// the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpgt_epi16 +FORCE_INLINE __m128i _mm_cmpgt_epi16(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_u16( + vcgtq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b))); +} + +// Compare packed signed 32-bit integers in a and b for greater-than, and store +// the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpgt_epi32 +FORCE_INLINE __m128i _mm_cmpgt_epi32(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_u32( + vcgtq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b))); +} + +// Compare packed signed 8-bit integers in a and b for greater-than, and store +// the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpgt_epi8 +FORCE_INLINE __m128i _mm_cmpgt_epi8(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_u8( + vcgtq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b))); +} + +// Compare packed double-precision (64-bit) floating-point elements in a and b +// for greater-than, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpgt_pd +FORCE_INLINE __m128d _mm_cmpgt_pd(__m128d a, __m128d b) +{ +#if defined(__aarch64__) || defined(_M_ARM64) + return vreinterpretq_m128d_u64( + vcgtq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b))); +#else + uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a)); + uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a)); + uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b)); + uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b)); + uint64_t d[2]; + d[0] = (*(double *) &a0) > (*(double *) &b0) ? ~UINT64_C(0) : UINT64_C(0); + d[1] = (*(double *) &a1) > (*(double *) &b1) ? ~UINT64_C(0) : UINT64_C(0); + + return vreinterpretq_m128d_u64(vld1q_u64(d)); +#endif +} + +// Compare the lower double-precision (64-bit) floating-point elements in a and +// b for greater-than, store the result in the lower element of dst, and copy +// the upper element from a to the upper element of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpgt_sd +FORCE_INLINE __m128d _mm_cmpgt_sd(__m128d a, __m128d b) +{ +#if defined(__aarch64__) || defined(_M_ARM64) + return _mm_move_sd(a, _mm_cmpgt_pd(a, b)); +#else + // expand "_mm_cmpge_pd()" to reduce unnecessary operations + uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a)); + uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a)); + uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b)); + uint64_t d[2]; + d[0] = (*(double *) &a0) > (*(double *) &b0) ? ~UINT64_C(0) : UINT64_C(0); + d[1] = a1; + + return vreinterpretq_m128d_u64(vld1q_u64(d)); +#endif +} + +// Compare packed double-precision (64-bit) floating-point elements in a and b +// for less-than-or-equal, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmple_pd +FORCE_INLINE __m128d _mm_cmple_pd(__m128d a, __m128d b) +{ +#if defined(__aarch64__) || defined(_M_ARM64) + return vreinterpretq_m128d_u64( + vcleq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b))); +#else + uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a)); + uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a)); + uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b)); + uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b)); + uint64_t d[2]; + d[0] = (*(double *) &a0) <= (*(double *) &b0) ? ~UINT64_C(0) : UINT64_C(0); + d[1] = (*(double *) &a1) <= (*(double *) &b1) ? ~UINT64_C(0) : UINT64_C(0); + + return vreinterpretq_m128d_u64(vld1q_u64(d)); +#endif +} + +// Compare the lower double-precision (64-bit) floating-point elements in a and +// b for less-than-or-equal, store the result in the lower element of dst, and +// copy the upper element from a to the upper element of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmple_sd +FORCE_INLINE __m128d _mm_cmple_sd(__m128d a, __m128d b) +{ +#if defined(__aarch64__) || defined(_M_ARM64) + return _mm_move_sd(a, _mm_cmple_pd(a, b)); +#else + // expand "_mm_cmpge_pd()" to reduce unnecessary operations + uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a)); + uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a)); + uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b)); + uint64_t d[2]; + d[0] = (*(double *) &a0) <= (*(double *) &b0) ? ~UINT64_C(0) : UINT64_C(0); + d[1] = a1; + + return vreinterpretq_m128d_u64(vld1q_u64(d)); +#endif +} + +// Compare packed signed 16-bit integers in a and b for less-than, and store the +// results in dst. Note: This intrinsic emits the pcmpgtw instruction with the +// order of the operands switched. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmplt_epi16 +FORCE_INLINE __m128i _mm_cmplt_epi16(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_u16( + vcltq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b))); +} + +// Compare packed signed 32-bit integers in a and b for less-than, and store the +// results in dst. Note: This intrinsic emits the pcmpgtd instruction with the +// order of the operands switched. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmplt_epi32 +FORCE_INLINE __m128i _mm_cmplt_epi32(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_u32( + vcltq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b))); +} + +// Compare packed signed 8-bit integers in a and b for less-than, and store the +// results in dst. Note: This intrinsic emits the pcmpgtb instruction with the +// order of the operands switched. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmplt_epi8 +FORCE_INLINE __m128i _mm_cmplt_epi8(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_u8( + vcltq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b))); +} + +// Compare packed double-precision (64-bit) floating-point elements in a and b +// for less-than, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmplt_pd +FORCE_INLINE __m128d _mm_cmplt_pd(__m128d a, __m128d b) +{ +#if defined(__aarch64__) || defined(_M_ARM64) + return vreinterpretq_m128d_u64( + vcltq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b))); +#else + uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a)); + uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a)); + uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b)); + uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b)); + uint64_t d[2]; + d[0] = (*(double *) &a0) < (*(double *) &b0) ? ~UINT64_C(0) : UINT64_C(0); + d[1] = (*(double *) &a1) < (*(double *) &b1) ? ~UINT64_C(0) : UINT64_C(0); + + return vreinterpretq_m128d_u64(vld1q_u64(d)); +#endif +} + +// Compare the lower double-precision (64-bit) floating-point elements in a and +// b for less-than, store the result in the lower element of dst, and copy the +// upper element from a to the upper element of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmplt_sd +FORCE_INLINE __m128d _mm_cmplt_sd(__m128d a, __m128d b) +{ +#if defined(__aarch64__) || defined(_M_ARM64) + return _mm_move_sd(a, _mm_cmplt_pd(a, b)); +#else + uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a)); + uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a)); + uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b)); + uint64_t d[2]; + d[0] = (*(double *) &a0) < (*(double *) &b0) ? ~UINT64_C(0) : UINT64_C(0); + d[1] = a1; + + return vreinterpretq_m128d_u64(vld1q_u64(d)); +#endif +} + +// Compare packed double-precision (64-bit) floating-point elements in a and b +// for not-equal, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpneq_pd +FORCE_INLINE __m128d _mm_cmpneq_pd(__m128d a, __m128d b) +{ +#if defined(__aarch64__) || defined(_M_ARM64) + return vreinterpretq_m128d_s32(vmvnq_s32(vreinterpretq_s32_u64( + vceqq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b))))); +#else + // (a == b) -> (a_lo == b_lo) && (a_hi == b_hi) + uint32x4_t cmp = + vceqq_u32(vreinterpretq_u32_m128d(a), vreinterpretq_u32_m128d(b)); + uint32x4_t swapped = vrev64q_u32(cmp); + return vreinterpretq_m128d_u32(vmvnq_u32(vandq_u32(cmp, swapped))); +#endif +} + +// Compare the lower double-precision (64-bit) floating-point elements in a and +// b for not-equal, store the result in the lower element of dst, and copy the +// upper element from a to the upper element of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpneq_sd +FORCE_INLINE __m128d _mm_cmpneq_sd(__m128d a, __m128d b) +{ + return _mm_move_sd(a, _mm_cmpneq_pd(a, b)); +} + +// Compare packed double-precision (64-bit) floating-point elements in a and b +// for not-greater-than-or-equal, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnge_pd +FORCE_INLINE __m128d _mm_cmpnge_pd(__m128d a, __m128d b) +{ +#if defined(__aarch64__) || defined(_M_ARM64) + return vreinterpretq_m128d_u64(veorq_u64( + vcgeq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)), + vdupq_n_u64(UINT64_MAX))); +#else + uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a)); + uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a)); + uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b)); + uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b)); + uint64_t d[2]; + d[0] = + !((*(double *) &a0) >= (*(double *) &b0)) ? ~UINT64_C(0) : UINT64_C(0); + d[1] = + !((*(double *) &a1) >= (*(double *) &b1)) ? ~UINT64_C(0) : UINT64_C(0); + + return vreinterpretq_m128d_u64(vld1q_u64(d)); +#endif +} + +// Compare the lower double-precision (64-bit) floating-point elements in a and +// b for not-greater-than-or-equal, store the result in the lower element of +// dst, and copy the upper element from a to the upper element of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnge_sd +FORCE_INLINE __m128d _mm_cmpnge_sd(__m128d a, __m128d b) +{ + return _mm_move_sd(a, _mm_cmpnge_pd(a, b)); +} + +// Compare packed double-precision (64-bit) floating-point elements in a and b +// for not-greater-than, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_cmpngt_pd +FORCE_INLINE __m128d _mm_cmpngt_pd(__m128d a, __m128d b) +{ +#if defined(__aarch64__) || defined(_M_ARM64) + return vreinterpretq_m128d_u64(veorq_u64( + vcgtq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)), + vdupq_n_u64(UINT64_MAX))); +#else + uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a)); + uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a)); + uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b)); + uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b)); + uint64_t d[2]; + d[0] = + !((*(double *) &a0) > (*(double *) &b0)) ? ~UINT64_C(0) : UINT64_C(0); + d[1] = + !((*(double *) &a1) > (*(double *) &b1)) ? ~UINT64_C(0) : UINT64_C(0); + + return vreinterpretq_m128d_u64(vld1q_u64(d)); +#endif +} + +// Compare the lower double-precision (64-bit) floating-point elements in a and +// b for not-greater-than, store the result in the lower element of dst, and +// copy the upper element from a to the upper element of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpngt_sd +FORCE_INLINE __m128d _mm_cmpngt_sd(__m128d a, __m128d b) +{ + return _mm_move_sd(a, _mm_cmpngt_pd(a, b)); +} + +// Compare packed double-precision (64-bit) floating-point elements in a and b +// for not-less-than-or-equal, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnle_pd +FORCE_INLINE __m128d _mm_cmpnle_pd(__m128d a, __m128d b) +{ +#if defined(__aarch64__) || defined(_M_ARM64) + return vreinterpretq_m128d_u64(veorq_u64( + vcleq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)), + vdupq_n_u64(UINT64_MAX))); +#else + uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a)); + uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a)); + uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b)); + uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b)); + uint64_t d[2]; + d[0] = + !((*(double *) &a0) <= (*(double *) &b0)) ? ~UINT64_C(0) : UINT64_C(0); + d[1] = + !((*(double *) &a1) <= (*(double *) &b1)) ? ~UINT64_C(0) : UINT64_C(0); + + return vreinterpretq_m128d_u64(vld1q_u64(d)); +#endif +} + +// Compare the lower double-precision (64-bit) floating-point elements in a and +// b for not-less-than-or-equal, store the result in the lower element of dst, +// and copy the upper element from a to the upper element of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnle_sd +FORCE_INLINE __m128d _mm_cmpnle_sd(__m128d a, __m128d b) +{ + return _mm_move_sd(a, _mm_cmpnle_pd(a, b)); +} + +// Compare packed double-precision (64-bit) floating-point elements in a and b +// for not-less-than, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnlt_pd +FORCE_INLINE __m128d _mm_cmpnlt_pd(__m128d a, __m128d b) +{ +#if defined(__aarch64__) || defined(_M_ARM64) + return vreinterpretq_m128d_u64(veorq_u64( + vcltq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)), + vdupq_n_u64(UINT64_MAX))); +#else + uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a)); + uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a)); + uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b)); + uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b)); + uint64_t d[2]; + d[0] = + !((*(double *) &a0) < (*(double *) &b0)) ? ~UINT64_C(0) : UINT64_C(0); + d[1] = + !((*(double *) &a1) < (*(double *) &b1)) ? ~UINT64_C(0) : UINT64_C(0); + + return vreinterpretq_m128d_u64(vld1q_u64(d)); +#endif +} + +// Compare the lower double-precision (64-bit) floating-point elements in a and +// b for not-less-than, store the result in the lower element of dst, and copy +// the upper element from a to the upper element of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnlt_sd +FORCE_INLINE __m128d _mm_cmpnlt_sd(__m128d a, __m128d b) +{ + return _mm_move_sd(a, _mm_cmpnlt_pd(a, b)); +} + +// Compare packed double-precision (64-bit) floating-point elements in a and b +// to see if neither is NaN, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpord_pd +FORCE_INLINE __m128d _mm_cmpord_pd(__m128d a, __m128d b) +{ +#if defined(__aarch64__) || defined(_M_ARM64) + // Excluding NaNs, any two floating point numbers can be compared. + uint64x2_t not_nan_a = + vceqq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(a)); + uint64x2_t not_nan_b = + vceqq_f64(vreinterpretq_f64_m128d(b), vreinterpretq_f64_m128d(b)); + return vreinterpretq_m128d_u64(vandq_u64(not_nan_a, not_nan_b)); +#else + uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a)); + uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a)); + uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b)); + uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b)); + uint64_t d[2]; + d[0] = ((*(double *) &a0) == (*(double *) &a0) && + (*(double *) &b0) == (*(double *) &b0)) + ? ~UINT64_C(0) + : UINT64_C(0); + d[1] = ((*(double *) &a1) == (*(double *) &a1) && + (*(double *) &b1) == (*(double *) &b1)) + ? ~UINT64_C(0) + : UINT64_C(0); + + return vreinterpretq_m128d_u64(vld1q_u64(d)); +#endif +} + +// Compare the lower double-precision (64-bit) floating-point elements in a and +// b to see if neither is NaN, store the result in the lower element of dst, and +// copy the upper element from a to the upper element of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpord_sd +FORCE_INLINE __m128d _mm_cmpord_sd(__m128d a, __m128d b) +{ +#if defined(__aarch64__) || defined(_M_ARM64) + return _mm_move_sd(a, _mm_cmpord_pd(a, b)); +#else + uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a)); + uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b)); + uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a)); + uint64_t d[2]; + d[0] = ((*(double *) &a0) == (*(double *) &a0) && + (*(double *) &b0) == (*(double *) &b0)) + ? ~UINT64_C(0) + : UINT64_C(0); + d[1] = a1; + + return vreinterpretq_m128d_u64(vld1q_u64(d)); +#endif +} + +// Compare packed double-precision (64-bit) floating-point elements in a and b +// to see if either is NaN, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpunord_pd +FORCE_INLINE __m128d _mm_cmpunord_pd(__m128d a, __m128d b) +{ +#if defined(__aarch64__) || defined(_M_ARM64) + // Two NaNs are not equal in comparison operation. + uint64x2_t not_nan_a = + vceqq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(a)); + uint64x2_t not_nan_b = + vceqq_f64(vreinterpretq_f64_m128d(b), vreinterpretq_f64_m128d(b)); + return vreinterpretq_m128d_s32( + vmvnq_s32(vreinterpretq_s32_u64(vandq_u64(not_nan_a, not_nan_b)))); +#else + uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a)); + uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a)); + uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b)); + uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b)); + uint64_t d[2]; + d[0] = ((*(double *) &a0) == (*(double *) &a0) && + (*(double *) &b0) == (*(double *) &b0)) + ? UINT64_C(0) + : ~UINT64_C(0); + d[1] = ((*(double *) &a1) == (*(double *) &a1) && + (*(double *) &b1) == (*(double *) &b1)) + ? UINT64_C(0) + : ~UINT64_C(0); + + return vreinterpretq_m128d_u64(vld1q_u64(d)); +#endif +} + +// Compare the lower double-precision (64-bit) floating-point elements in a and +// b to see if either is NaN, store the result in the lower element of dst, and +// copy the upper element from a to the upper element of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpunord_sd +FORCE_INLINE __m128d _mm_cmpunord_sd(__m128d a, __m128d b) +{ +#if defined(__aarch64__) || defined(_M_ARM64) + return _mm_move_sd(a, _mm_cmpunord_pd(a, b)); +#else + uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a)); + uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b)); + uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a)); + uint64_t d[2]; + d[0] = ((*(double *) &a0) == (*(double *) &a0) && + (*(double *) &b0) == (*(double *) &b0)) + ? UINT64_C(0) + : ~UINT64_C(0); + d[1] = a1; + + return vreinterpretq_m128d_u64(vld1q_u64(d)); +#endif +} + +// Compare the lower double-precision (64-bit) floating-point element in a and b +// for greater-than-or-equal, and return the boolean result (0 or 1). +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comige_sd +FORCE_INLINE int _mm_comige_sd(__m128d a, __m128d b) +{ +#if defined(__aarch64__) || defined(_M_ARM64) + return vgetq_lane_u64(vcgeq_f64(a, b), 0) & 0x1; +#else + uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a)); + uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b)); + + return (*(double *) &a0 >= *(double *) &b0); +#endif +} + +// Compare the lower double-precision (64-bit) floating-point element in a and b +// for greater-than, and return the boolean result (0 or 1). +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comigt_sd +FORCE_INLINE int _mm_comigt_sd(__m128d a, __m128d b) +{ +#if defined(__aarch64__) || defined(_M_ARM64) + return vgetq_lane_u64(vcgtq_f64(a, b), 0) & 0x1; +#else + uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a)); + uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b)); + + return (*(double *) &a0 > *(double *) &b0); +#endif +} + +// Compare the lower double-precision (64-bit) floating-point element in a and b +// for less-than-or-equal, and return the boolean result (0 or 1). +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comile_sd +FORCE_INLINE int _mm_comile_sd(__m128d a, __m128d b) +{ +#if defined(__aarch64__) || defined(_M_ARM64) + return vgetq_lane_u64(vcleq_f64(a, b), 0) & 0x1; +#else + uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a)); + uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b)); + + return (*(double *) &a0 <= *(double *) &b0); +#endif +} + +// Compare the lower double-precision (64-bit) floating-point element in a and b +// for less-than, and return the boolean result (0 or 1). +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comilt_sd +FORCE_INLINE int _mm_comilt_sd(__m128d a, __m128d b) +{ +#if defined(__aarch64__) || defined(_M_ARM64) + return vgetq_lane_u64(vcltq_f64(a, b), 0) & 0x1; +#else + uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a)); + uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b)); + + return (*(double *) &a0 < *(double *) &b0); +#endif +} + +// Compare the lower double-precision (64-bit) floating-point element in a and b +// for equality, and return the boolean result (0 or 1). +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comieq_sd +FORCE_INLINE int _mm_comieq_sd(__m128d a, __m128d b) +{ +#if defined(__aarch64__) || defined(_M_ARM64) + return vgetq_lane_u64(vceqq_f64(a, b), 0) & 0x1; +#else + uint32x4_t a_not_nan = + vceqq_u32(vreinterpretq_u32_m128d(a), vreinterpretq_u32_m128d(a)); + uint32x4_t b_not_nan = + vceqq_u32(vreinterpretq_u32_m128d(b), vreinterpretq_u32_m128d(b)); + uint32x4_t a_and_b_not_nan = vandq_u32(a_not_nan, b_not_nan); + uint32x4_t a_eq_b = + vceqq_u32(vreinterpretq_u32_m128d(a), vreinterpretq_u32_m128d(b)); + uint64x2_t and_results = vandq_u64(vreinterpretq_u64_u32(a_and_b_not_nan), + vreinterpretq_u64_u32(a_eq_b)); + return vgetq_lane_u64(and_results, 0) & 0x1; +#endif +} + +// Compare the lower double-precision (64-bit) floating-point element in a and b +// for not-equal, and return the boolean result (0 or 1). +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comineq_sd +FORCE_INLINE int _mm_comineq_sd(__m128d a, __m128d b) +{ + return !_mm_comieq_sd(a, b); +} + +// Convert packed signed 32-bit integers in a to packed double-precision +// (64-bit) floating-point elements, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepi32_pd +FORCE_INLINE __m128d _mm_cvtepi32_pd(__m128i a) +{ +#if defined(__aarch64__) || defined(_M_ARM64) + return vreinterpretq_m128d_f64( + vcvtq_f64_s64(vmovl_s32(vget_low_s32(vreinterpretq_s32_m128i(a))))); +#else + double a0 = (double) vgetq_lane_s32(vreinterpretq_s32_m128i(a), 0); + double a1 = (double) vgetq_lane_s32(vreinterpretq_s32_m128i(a), 1); + return _mm_set_pd(a1, a0); +#endif +} + +// Convert packed signed 32-bit integers in a to packed single-precision +// (32-bit) floating-point elements, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepi32_ps +FORCE_INLINE __m128 _mm_cvtepi32_ps(__m128i a) +{ + return vreinterpretq_m128_f32(vcvtq_f32_s32(vreinterpretq_s32_m128i(a))); +} + +// Convert packed double-precision (64-bit) floating-point elements in a to +// packed 32-bit integers, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtpd_epi32 +FORCE_INLINE __m128i _mm_cvtpd_epi32(__m128d a) +{ +// vrnd32xq_f64 not supported on clang +#if defined(__ARM_FEATURE_FRINT) && !defined(__clang__) + float64x2_t rounded = vrnd32xq_f64(vreinterpretq_f64_m128d(a)); + int64x2_t integers = vcvtq_s64_f64(rounded); + return vreinterpretq_m128i_s32( + vcombine_s32(vmovn_s64(integers), vdup_n_s32(0))); +#else + __m128d rnd = _mm_round_pd(a, _MM_FROUND_CUR_DIRECTION); + double d0 = ((double *) &rnd)[0]; + double d1 = ((double *) &rnd)[1]; + return _mm_set_epi32(0, 0, (int32_t) d1, (int32_t) d0); +#endif +} + +// Convert packed double-precision (64-bit) floating-point elements in a to +// packed 32-bit integers, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtpd_pi32 +FORCE_INLINE __m64 _mm_cvtpd_pi32(__m128d a) +{ + __m128d rnd = _mm_round_pd(a, _MM_FROUND_CUR_DIRECTION); + double d0 = ((double *) &rnd)[0]; + double d1 = ((double *) &rnd)[1]; + int32_t ALIGN_STRUCT(16) data[2] = {(int32_t) d0, (int32_t) d1}; + return vreinterpret_m64_s32(vld1_s32(data)); +} + +// Convert packed double-precision (64-bit) floating-point elements in a to +// packed single-precision (32-bit) floating-point elements, and store the +// results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtpd_ps +FORCE_INLINE __m128 _mm_cvtpd_ps(__m128d a) +{ +#if defined(__aarch64__) || defined(_M_ARM64) + float32x2_t tmp = vcvt_f32_f64(vreinterpretq_f64_m128d(a)); + return vreinterpretq_m128_f32(vcombine_f32(tmp, vdup_n_f32(0))); +#else + float a0 = (float) ((double *) &a)[0]; + float a1 = (float) ((double *) &a)[1]; + return _mm_set_ps(0, 0, a1, a0); +#endif +} + +// Convert packed signed 32-bit integers in a to packed double-precision +// (64-bit) floating-point elements, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtpi32_pd +FORCE_INLINE __m128d _mm_cvtpi32_pd(__m64 a) +{ +#if defined(__aarch64__) || defined(_M_ARM64) + return vreinterpretq_m128d_f64( + vcvtq_f64_s64(vmovl_s32(vreinterpret_s32_m64(a)))); +#else + double a0 = (double) vget_lane_s32(vreinterpret_s32_m64(a), 0); + double a1 = (double) vget_lane_s32(vreinterpret_s32_m64(a), 1); + return _mm_set_pd(a1, a0); +#endif +} + +// Convert packed single-precision (32-bit) floating-point elements in a to +// packed 32-bit integers, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtps_epi32 +// *NOTE*. The default rounding mode on SSE is 'round to even', which ARMv7-A +// does not support! It is supported on ARMv8-A however. +FORCE_INLINE __m128i _mm_cvtps_epi32(__m128 a) +{ +#if defined(__ARM_FEATURE_FRINT) + return vreinterpretq_m128i_s32(vcvtq_s32_f32(vrnd32xq_f32(a))); +#elif (defined(__aarch64__) || defined(_M_ARM64)) || \ + defined(__ARM_FEATURE_DIRECTED_ROUNDING) + switch (_MM_GET_ROUNDING_MODE()) { + case _MM_ROUND_NEAREST: + return vreinterpretq_m128i_s32(vcvtnq_s32_f32(a)); + case _MM_ROUND_DOWN: + return vreinterpretq_m128i_s32(vcvtmq_s32_f32(a)); + case _MM_ROUND_UP: + return vreinterpretq_m128i_s32(vcvtpq_s32_f32(a)); + default: // _MM_ROUND_TOWARD_ZERO + return vreinterpretq_m128i_s32(vcvtq_s32_f32(a)); + } +#else + float *f = (float *) &a; + switch (_MM_GET_ROUNDING_MODE()) { + case _MM_ROUND_NEAREST: { + uint32x4_t signmask = vdupq_n_u32(0x80000000); + float32x4_t half = vbslq_f32(signmask, vreinterpretq_f32_m128(a), + vdupq_n_f32(0.5f)); /* +/- 0.5 */ + int32x4_t r_normal = vcvtq_s32_f32(vaddq_f32( + vreinterpretq_f32_m128(a), half)); /* round to integer: [a + 0.5]*/ + int32x4_t r_trunc = vcvtq_s32_f32( + vreinterpretq_f32_m128(a)); /* truncate to integer: [a] */ + int32x4_t plusone = vreinterpretq_s32_u32(vshrq_n_u32( + vreinterpretq_u32_s32(vnegq_s32(r_trunc)), 31)); /* 1 or 0 */ + int32x4_t r_even = vbicq_s32(vaddq_s32(r_trunc, plusone), + vdupq_n_s32(1)); /* ([a] + {0,1}) & ~1 */ + float32x4_t delta = vsubq_f32( + vreinterpretq_f32_m128(a), + vcvtq_f32_s32(r_trunc)); /* compute delta: delta = (a - [a]) */ + uint32x4_t is_delta_half = + vceqq_f32(delta, half); /* delta == +/- 0.5 */ + return vreinterpretq_m128i_s32( + vbslq_s32(is_delta_half, r_even, r_normal)); + } + case _MM_ROUND_DOWN: + return _mm_set_epi32(floorf(f[3]), floorf(f[2]), floorf(f[1]), + floorf(f[0])); + case _MM_ROUND_UP: + return _mm_set_epi32(ceilf(f[3]), ceilf(f[2]), ceilf(f[1]), + ceilf(f[0])); + default: // _MM_ROUND_TOWARD_ZERO + return _mm_set_epi32((int32_t) f[3], (int32_t) f[2], (int32_t) f[1], + (int32_t) f[0]); + } +#endif +} + +// Convert packed single-precision (32-bit) floating-point elements in a to +// packed double-precision (64-bit) floating-point elements, and store the +// results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtps_pd +FORCE_INLINE __m128d _mm_cvtps_pd(__m128 a) +{ +#if defined(__aarch64__) || defined(_M_ARM64) + return vreinterpretq_m128d_f64( + vcvt_f64_f32(vget_low_f32(vreinterpretq_f32_m128(a)))); +#else + double a0 = (double) vgetq_lane_f32(vreinterpretq_f32_m128(a), 0); + double a1 = (double) vgetq_lane_f32(vreinterpretq_f32_m128(a), 1); + return _mm_set_pd(a1, a0); +#endif +} + +// Copy the lower double-precision (64-bit) floating-point element of a to dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsd_f64 +FORCE_INLINE double _mm_cvtsd_f64(__m128d a) +{ +#if defined(__aarch64__) || defined(_M_ARM64) + return (double) vgetq_lane_f64(vreinterpretq_f64_m128d(a), 0); +#else + return ((double *) &a)[0]; +#endif +} + +// Convert the lower double-precision (64-bit) floating-point element in a to a +// 32-bit integer, and store the result in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsd_si32 +FORCE_INLINE int32_t _mm_cvtsd_si32(__m128d a) +{ +#if defined(__aarch64__) || defined(_M_ARM64) + return (int32_t) vgetq_lane_f64(vrndiq_f64(vreinterpretq_f64_m128d(a)), 0); +#else + __m128d rnd = _mm_round_pd(a, _MM_FROUND_CUR_DIRECTION); + double ret = ((double *) &rnd)[0]; + return (int32_t) ret; +#endif +} + +// Convert the lower double-precision (64-bit) floating-point element in a to a +// 64-bit integer, and store the result in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsd_si64 +FORCE_INLINE int64_t _mm_cvtsd_si64(__m128d a) +{ +#if defined(__aarch64__) || defined(_M_ARM64) + return (int64_t) vgetq_lane_f64(vrndiq_f64(vreinterpretq_f64_m128d(a)), 0); +#else + __m128d rnd = _mm_round_pd(a, _MM_FROUND_CUR_DIRECTION); + double ret = ((double *) &rnd)[0]; + return (int64_t) ret; +#endif +} + +// Convert the lower double-precision (64-bit) floating-point element in a to a +// 64-bit integer, and store the result in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsd_si64x +#define _mm_cvtsd_si64x _mm_cvtsd_si64 + +// Convert the lower double-precision (64-bit) floating-point element in b to a +// single-precision (32-bit) floating-point element, store the result in the +// lower element of dst, and copy the upper 3 packed elements from a to the +// upper elements of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsd_ss +FORCE_INLINE __m128 _mm_cvtsd_ss(__m128 a, __m128d b) +{ +#if defined(__aarch64__) || defined(_M_ARM64) + return vreinterpretq_m128_f32(vsetq_lane_f32( + vget_lane_f32(vcvt_f32_f64(vreinterpretq_f64_m128d(b)), 0), + vreinterpretq_f32_m128(a), 0)); +#else + return vreinterpretq_m128_f32(vsetq_lane_f32((float) ((double *) &b)[0], + vreinterpretq_f32_m128(a), 0)); +#endif +} + +// Copy the lower 32-bit integer in a to dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi128_si32 +FORCE_INLINE int _mm_cvtsi128_si32(__m128i a) +{ + return vgetq_lane_s32(vreinterpretq_s32_m128i(a), 0); +} + +// Copy the lower 64-bit integer in a to dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi128_si64 +FORCE_INLINE int64_t _mm_cvtsi128_si64(__m128i a) +{ + return vgetq_lane_s64(vreinterpretq_s64_m128i(a), 0); +} + +// Copy the lower 64-bit integer in a to dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi128_si64x +#define _mm_cvtsi128_si64x(a) _mm_cvtsi128_si64(a) + +// Convert the signed 32-bit integer b to a double-precision (64-bit) +// floating-point element, store the result in the lower element of dst, and +// copy the upper element from a to the upper element of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi32_sd +FORCE_INLINE __m128d _mm_cvtsi32_sd(__m128d a, int32_t b) +{ +#if defined(__aarch64__) || defined(_M_ARM64) + return vreinterpretq_m128d_f64( + vsetq_lane_f64((double) b, vreinterpretq_f64_m128d(a), 0)); +#else + double bf = (double) b; + return vreinterpretq_m128d_s64( + vsetq_lane_s64(*(int64_t *) &bf, vreinterpretq_s64_m128d(a), 0)); +#endif +} + +// Copy the lower 64-bit integer in a to dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi128_si64x +#define _mm_cvtsi128_si64x(a) _mm_cvtsi128_si64(a) + +// Copy 32-bit integer a to the lower elements of dst, and zero the upper +// elements of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi32_si128 +FORCE_INLINE __m128i _mm_cvtsi32_si128(int a) +{ + return vreinterpretq_m128i_s32(vsetq_lane_s32(a, vdupq_n_s32(0), 0)); +} + +// Convert the signed 64-bit integer b to a double-precision (64-bit) +// floating-point element, store the result in the lower element of dst, and +// copy the upper element from a to the upper element of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi64_sd +FORCE_INLINE __m128d _mm_cvtsi64_sd(__m128d a, int64_t b) +{ +#if defined(__aarch64__) || defined(_M_ARM64) + return vreinterpretq_m128d_f64( + vsetq_lane_f64((double) b, vreinterpretq_f64_m128d(a), 0)); +#else + double bf = (double) b; + return vreinterpretq_m128d_s64( + vsetq_lane_s64(*(int64_t *) &bf, vreinterpretq_s64_m128d(a), 0)); +#endif +} + +// Copy 64-bit integer a to the lower element of dst, and zero the upper +// element. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi64_si128 +FORCE_INLINE __m128i _mm_cvtsi64_si128(int64_t a) +{ + return vreinterpretq_m128i_s64(vsetq_lane_s64(a, vdupq_n_s64(0), 0)); +} + +// Copy 64-bit integer a to the lower element of dst, and zero the upper +// element. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi64x_si128 +#define _mm_cvtsi64x_si128(a) _mm_cvtsi64_si128(a) + +// Convert the signed 64-bit integer b to a double-precision (64-bit) +// floating-point element, store the result in the lower element of dst, and +// copy the upper element from a to the upper element of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi64x_sd +#define _mm_cvtsi64x_sd(a, b) _mm_cvtsi64_sd(a, b) + +// Convert the lower single-precision (32-bit) floating-point element in b to a +// double-precision (64-bit) floating-point element, store the result in the +// lower element of dst, and copy the upper element from a to the upper element +// of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtss_sd +FORCE_INLINE __m128d _mm_cvtss_sd(__m128d a, __m128 b) +{ + double d = (double) vgetq_lane_f32(vreinterpretq_f32_m128(b), 0); +#if defined(__aarch64__) || defined(_M_ARM64) + return vreinterpretq_m128d_f64( + vsetq_lane_f64(d, vreinterpretq_f64_m128d(a), 0)); +#else + return vreinterpretq_m128d_s64( + vsetq_lane_s64(*(int64_t *) &d, vreinterpretq_s64_m128d(a), 0)); +#endif +} + +// Convert packed double-precision (64-bit) floating-point elements in a to +// packed 32-bit integers with truncation, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttpd_epi32 +FORCE_INLINE __m128i _mm_cvttpd_epi32(__m128d a) +{ + double a0 = ((double *) &a)[0]; + double a1 = ((double *) &a)[1]; + return _mm_set_epi32(0, 0, (int32_t) a1, (int32_t) a0); +} + +// Convert packed double-precision (64-bit) floating-point elements in a to +// packed 32-bit integers with truncation, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttpd_pi32 +FORCE_INLINE __m64 _mm_cvttpd_pi32(__m128d a) +{ + double a0 = ((double *) &a)[0]; + double a1 = ((double *) &a)[1]; + int32_t ALIGN_STRUCT(16) data[2] = {(int32_t) a0, (int32_t) a1}; + return vreinterpret_m64_s32(vld1_s32(data)); +} + +// Convert packed single-precision (32-bit) floating-point elements in a to +// packed 32-bit integers with truncation, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttps_epi32 +FORCE_INLINE __m128i _mm_cvttps_epi32(__m128 a) +{ + return vreinterpretq_m128i_s32(vcvtq_s32_f32(vreinterpretq_f32_m128(a))); +} + +// Convert the lower double-precision (64-bit) floating-point element in a to a +// 32-bit integer with truncation, and store the result in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttsd_si32 +FORCE_INLINE int32_t _mm_cvttsd_si32(__m128d a) +{ + double ret = *((double *) &a); + return (int32_t) ret; +} + +// Convert the lower double-precision (64-bit) floating-point element in a to a +// 64-bit integer with truncation, and store the result in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttsd_si64 +FORCE_INLINE int64_t _mm_cvttsd_si64(__m128d a) +{ +#if defined(__aarch64__) || defined(_M_ARM64) + return vgetq_lane_s64(vcvtq_s64_f64(vreinterpretq_f64_m128d(a)), 0); +#else + double ret = *((double *) &a); + return (int64_t) ret; +#endif +} + +// Convert the lower double-precision (64-bit) floating-point element in a to a +// 64-bit integer with truncation, and store the result in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttsd_si64x +#define _mm_cvttsd_si64x(a) _mm_cvttsd_si64(a) + +// Divide packed double-precision (64-bit) floating-point elements in a by +// packed elements in b, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_div_pd +FORCE_INLINE __m128d _mm_div_pd(__m128d a, __m128d b) +{ +#if defined(__aarch64__) || defined(_M_ARM64) + return vreinterpretq_m128d_f64( + vdivq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b))); +#else + double *da = (double *) &a; + double *db = (double *) &b; + double c[2]; + c[0] = da[0] / db[0]; + c[1] = da[1] / db[1]; + return vld1q_f32((float32_t *) c); +#endif +} + +// Divide the lower double-precision (64-bit) floating-point element in a by the +// lower double-precision (64-bit) floating-point element in b, store the result +// in the lower element of dst, and copy the upper element from a to the upper +// element of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_div_sd +FORCE_INLINE __m128d _mm_div_sd(__m128d a, __m128d b) +{ +#if defined(__aarch64__) || defined(_M_ARM64) + float64x2_t tmp = + vdivq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)); + return vreinterpretq_m128d_f64( + vsetq_lane_f64(vgetq_lane_f64(vreinterpretq_f64_m128d(a), 1), tmp, 1)); +#else + return _mm_move_sd(a, _mm_div_pd(a, b)); +#endif +} + +// Extract a 16-bit integer from a, selected with imm8, and store the result in +// the lower element of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_extract_epi16 +// FORCE_INLINE int _mm_extract_epi16(__m128i a, __constrange(0,8) int imm) +#define _mm_extract_epi16(a, imm) \ + vgetq_lane_u16(vreinterpretq_u16_m128i(a), (imm)) + +// Copy a to dst, and insert the 16-bit integer i into dst at the location +// specified by imm8. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_insert_epi16 +// FORCE_INLINE __m128i _mm_insert_epi16(__m128i a, int b, +// __constrange(0,8) int imm) +#define _mm_insert_epi16(a, b, imm) \ + vreinterpretq_m128i_s16( \ + vsetq_lane_s16((b), vreinterpretq_s16_m128i(a), (imm))) + +// Load 128-bits (composed of 2 packed double-precision (64-bit) floating-point +// elements) from memory into dst. mem_addr must be aligned on a 16-byte +// boundary or a general-protection exception may be generated. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load_pd +FORCE_INLINE __m128d _mm_load_pd(const double *p) +{ +#if defined(__aarch64__) || defined(_M_ARM64) + return vreinterpretq_m128d_f64(vld1q_f64(p)); +#else + const float *fp = (const float *) p; + float ALIGN_STRUCT(16) data[4] = {fp[0], fp[1], fp[2], fp[3]}; + return vreinterpretq_m128d_f32(vld1q_f32(data)); +#endif +} + +// Load a double-precision (64-bit) floating-point element from memory into both +// elements of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load_pd1 +#define _mm_load_pd1 _mm_load1_pd + +// Load a double-precision (64-bit) floating-point element from memory into the +// lower of dst, and zero the upper element. mem_addr does not need to be +// aligned on any particular boundary. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load_sd +FORCE_INLINE __m128d _mm_load_sd(const double *p) +{ +#if defined(__aarch64__) || defined(_M_ARM64) + return vreinterpretq_m128d_f64(vsetq_lane_f64(*p, vdupq_n_f64(0), 0)); +#else + const float *fp = (const float *) p; + float ALIGN_STRUCT(16) data[4] = {fp[0], fp[1], 0, 0}; + return vreinterpretq_m128d_f32(vld1q_f32(data)); +#endif +} + +// Load 128-bits of integer data from memory into dst. mem_addr must be aligned +// on a 16-byte boundary or a general-protection exception may be generated. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load_si128 +FORCE_INLINE __m128i _mm_load_si128(const __m128i *p) +{ + return vreinterpretq_m128i_s32(vld1q_s32((const int32_t *) p)); +} + +// Load a double-precision (64-bit) floating-point element from memory into both +// elements of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load1_pd +FORCE_INLINE __m128d _mm_load1_pd(const double *p) +{ +#if defined(__aarch64__) || defined(_M_ARM64) + return vreinterpretq_m128d_f64(vld1q_dup_f64(p)); +#else + return vreinterpretq_m128d_s64(vdupq_n_s64(*(const int64_t *) p)); +#endif +} + +// Load a double-precision (64-bit) floating-point element from memory into the +// upper element of dst, and copy the lower element from a to dst. mem_addr does +// not need to be aligned on any particular boundary. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadh_pd +FORCE_INLINE __m128d _mm_loadh_pd(__m128d a, const double *p) +{ +#if defined(__aarch64__) || defined(_M_ARM64) + return vreinterpretq_m128d_f64( + vcombine_f64(vget_low_f64(vreinterpretq_f64_m128d(a)), vld1_f64(p))); +#else + return vreinterpretq_m128d_f32(vcombine_f32( + vget_low_f32(vreinterpretq_f32_m128d(a)), vld1_f32((const float *) p))); +#endif +} + +// Load 64-bit integer from memory into the first element of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadl_epi64 +FORCE_INLINE __m128i _mm_loadl_epi64(__m128i const *p) +{ + /* Load the lower 64 bits of the value pointed to by p into the + * lower 64 bits of the result, zeroing the upper 64 bits of the result. + */ + return vreinterpretq_m128i_s32( + vcombine_s32(vld1_s32((int32_t const *) p), vcreate_s32(0))); +} + +// Load a double-precision (64-bit) floating-point element from memory into the +// lower element of dst, and copy the upper element from a to dst. mem_addr does +// not need to be aligned on any particular boundary. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadl_pd +FORCE_INLINE __m128d _mm_loadl_pd(__m128d a, const double *p) +{ +#if defined(__aarch64__) || defined(_M_ARM64) + return vreinterpretq_m128d_f64( + vcombine_f64(vld1_f64(p), vget_high_f64(vreinterpretq_f64_m128d(a)))); +#else + return vreinterpretq_m128d_f32( + vcombine_f32(vld1_f32((const float *) p), + vget_high_f32(vreinterpretq_f32_m128d(a)))); +#endif +} + +// Load 2 double-precision (64-bit) floating-point elements from memory into dst +// in reverse order. mem_addr must be aligned on a 16-byte boundary or a +// general-protection exception may be generated. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadr_pd +FORCE_INLINE __m128d _mm_loadr_pd(const double *p) +{ +#if defined(__aarch64__) || defined(_M_ARM64) + float64x2_t v = vld1q_f64(p); + return vreinterpretq_m128d_f64(vextq_f64(v, v, 1)); +#else + int64x2_t v = vld1q_s64((const int64_t *) p); + return vreinterpretq_m128d_s64(vextq_s64(v, v, 1)); +#endif +} + +// Loads two double-precision from unaligned memory, floating-point values. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadu_pd +FORCE_INLINE __m128d _mm_loadu_pd(const double *p) +{ + return _mm_load_pd(p); +} + +// Load 128-bits of integer data from memory into dst. mem_addr does not need to +// be aligned on any particular boundary. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadu_si128 +FORCE_INLINE __m128i _mm_loadu_si128(const __m128i *p) +{ + return vreinterpretq_m128i_s32(vld1q_s32((const int32_t *) p)); +} + +// Load unaligned 32-bit integer from memory into the first element of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadu_si32 +FORCE_INLINE __m128i _mm_loadu_si32(const void *p) +{ + return vreinterpretq_m128i_s32( + vsetq_lane_s32(*(const int32_t *) p, vdupq_n_s32(0), 0)); +} + +// Multiply packed signed 16-bit integers in a and b, producing intermediate +// signed 32-bit integers. Horizontally add adjacent pairs of intermediate +// 32-bit integers, and pack the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_madd_epi16 +FORCE_INLINE __m128i _mm_madd_epi16(__m128i a, __m128i b) +{ + int32x4_t low = vmull_s16(vget_low_s16(vreinterpretq_s16_m128i(a)), + vget_low_s16(vreinterpretq_s16_m128i(b))); +#if defined(__aarch64__) || defined(_M_ARM64) + int32x4_t high = + vmull_high_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)); + + return vreinterpretq_m128i_s32(vpaddq_s32(low, high)); +#else + int32x4_t high = vmull_s16(vget_high_s16(vreinterpretq_s16_m128i(a)), + vget_high_s16(vreinterpretq_s16_m128i(b))); + + int32x2_t low_sum = vpadd_s32(vget_low_s32(low), vget_high_s32(low)); + int32x2_t high_sum = vpadd_s32(vget_low_s32(high), vget_high_s32(high)); + + return vreinterpretq_m128i_s32(vcombine_s32(low_sum, high_sum)); +#endif +} + +// Conditionally store 8-bit integer elements from a into memory using mask +// (elements are not stored when the highest bit is not set in the corresponding +// element) and a non-temporal memory hint. mem_addr does not need to be aligned +// on any particular boundary. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskmoveu_si128 +FORCE_INLINE void _mm_maskmoveu_si128(__m128i a, __m128i mask, char *mem_addr) +{ + int8x16_t shr_mask = vshrq_n_s8(vreinterpretq_s8_m128i(mask), 7); + __m128 b = _mm_load_ps((const float *) mem_addr); + int8x16_t masked = + vbslq_s8(vreinterpretq_u8_s8(shr_mask), vreinterpretq_s8_m128i(a), + vreinterpretq_s8_m128(b)); + vst1q_s8((int8_t *) mem_addr, masked); +} + +// Compare packed signed 16-bit integers in a and b, and store packed maximum +// values in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_epi16 +FORCE_INLINE __m128i _mm_max_epi16(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_s16( + vmaxq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b))); +} + +// Compare packed unsigned 8-bit integers in a and b, and store packed maximum +// values in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_epu8 +FORCE_INLINE __m128i _mm_max_epu8(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_u8( + vmaxq_u8(vreinterpretq_u8_m128i(a), vreinterpretq_u8_m128i(b))); +} + +// Compare packed double-precision (64-bit) floating-point elements in a and b, +// and store packed maximum values in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_pd +FORCE_INLINE __m128d _mm_max_pd(__m128d a, __m128d b) +{ +#if defined(__aarch64__) || defined(_M_ARM64) +#if SSE2NEON_PRECISE_MINMAX + float64x2_t _a = vreinterpretq_f64_m128d(a); + float64x2_t _b = vreinterpretq_f64_m128d(b); + return vreinterpretq_m128d_f64(vbslq_f64(vcgtq_f64(_a, _b), _a, _b)); +#else + return vreinterpretq_m128d_f64( + vmaxq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b))); +#endif +#else + uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a)); + uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a)); + uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b)); + uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b)); + uint64_t d[2]; + d[0] = (*(double *) &a0) > (*(double *) &b0) ? a0 : b0; + d[1] = (*(double *) &a1) > (*(double *) &b1) ? a1 : b1; + + return vreinterpretq_m128d_u64(vld1q_u64(d)); +#endif +} + +// Compare the lower double-precision (64-bit) floating-point elements in a and +// b, store the maximum value in the lower element of dst, and copy the upper +// element from a to the upper element of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_sd +FORCE_INLINE __m128d _mm_max_sd(__m128d a, __m128d b) +{ +#if defined(__aarch64__) || defined(_M_ARM64) + return _mm_move_sd(a, _mm_max_pd(a, b)); +#else + double *da = (double *) &a; + double *db = (double *) &b; + double c[2] = {da[0] > db[0] ? da[0] : db[0], da[1]}; + return vreinterpretq_m128d_f32(vld1q_f32((float32_t *) c)); +#endif +} + +// Compare packed signed 16-bit integers in a and b, and store packed minimum +// values in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_epi16 +FORCE_INLINE __m128i _mm_min_epi16(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_s16( + vminq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b))); +} + +// Compare packed unsigned 8-bit integers in a and b, and store packed minimum +// values in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_epu8 +FORCE_INLINE __m128i _mm_min_epu8(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_u8( + vminq_u8(vreinterpretq_u8_m128i(a), vreinterpretq_u8_m128i(b))); +} + +// Compare packed double-precision (64-bit) floating-point elements in a and b, +// and store packed minimum values in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_pd +FORCE_INLINE __m128d _mm_min_pd(__m128d a, __m128d b) +{ +#if defined(__aarch64__) || defined(_M_ARM64) +#if SSE2NEON_PRECISE_MINMAX + float64x2_t _a = vreinterpretq_f64_m128d(a); + float64x2_t _b = vreinterpretq_f64_m128d(b); + return vreinterpretq_m128d_f64(vbslq_f64(vcltq_f64(_a, _b), _a, _b)); +#else + return vreinterpretq_m128d_f64( + vminq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b))); +#endif +#else + uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a)); + uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a)); + uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b)); + uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b)); + uint64_t d[2]; + d[0] = (*(double *) &a0) < (*(double *) &b0) ? a0 : b0; + d[1] = (*(double *) &a1) < (*(double *) &b1) ? a1 : b1; + return vreinterpretq_m128d_u64(vld1q_u64(d)); +#endif +} + +// Compare the lower double-precision (64-bit) floating-point elements in a and +// b, store the minimum value in the lower element of dst, and copy the upper +// element from a to the upper element of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_sd +FORCE_INLINE __m128d _mm_min_sd(__m128d a, __m128d b) +{ +#if defined(__aarch64__) || defined(_M_ARM64) + return _mm_move_sd(a, _mm_min_pd(a, b)); +#else + double *da = (double *) &a; + double *db = (double *) &b; + double c[2] = {da[0] < db[0] ? da[0] : db[0], da[1]}; + return vreinterpretq_m128d_f32(vld1q_f32((float32_t *) c)); +#endif +} + +// Copy the lower 64-bit integer in a to the lower element of dst, and zero the +// upper element. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_move_epi64 +FORCE_INLINE __m128i _mm_move_epi64(__m128i a) +{ + return vreinterpretq_m128i_s64( + vsetq_lane_s64(0, vreinterpretq_s64_m128i(a), 1)); +} + +// Move the lower double-precision (64-bit) floating-point element from b to the +// lower element of dst, and copy the upper element from a to the upper element +// of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_move_sd +FORCE_INLINE __m128d _mm_move_sd(__m128d a, __m128d b) +{ + return vreinterpretq_m128d_f32( + vcombine_f32(vget_low_f32(vreinterpretq_f32_m128d(b)), + vget_high_f32(vreinterpretq_f32_m128d(a)))); +} + +// Create mask from the most significant bit of each 8-bit element in a, and +// store the result in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movemask_epi8 +FORCE_INLINE int _mm_movemask_epi8(__m128i a) +{ + // Use increasingly wide shifts+adds to collect the sign bits + // together. + // Since the widening shifts would be rather confusing to follow in little + // endian, everything will be illustrated in big endian order instead. This + // has a different result - the bits would actually be reversed on a big + // endian machine. + + // Starting input (only half the elements are shown): + // 89 ff 1d c0 00 10 99 33 + uint8x16_t input = vreinterpretq_u8_m128i(a); + + // Shift out everything but the sign bits with an unsigned shift right. + // + // Bytes of the vector:: + // 89 ff 1d c0 00 10 99 33 + // \ \ \ \ \ \ \ \ high_bits = (uint16x4_t)(input >> 7) + // | | | | | | | | + // 01 01 00 01 00 00 01 00 + // + // Bits of first important lane(s): + // 10001001 (89) + // \______ + // | + // 00000001 (01) + uint16x8_t high_bits = vreinterpretq_u16_u8(vshrq_n_u8(input, 7)); + + // Merge the even lanes together with a 16-bit unsigned shift right + add. + // 'xx' represents garbage data which will be ignored in the final result. + // In the important bytes, the add functions like a binary OR. + // + // 01 01 00 01 00 00 01 00 + // \_ | \_ | \_ | \_ | paired16 = (uint32x4_t)(input + (input >> 7)) + // \| \| \| \| + // xx 03 xx 01 xx 00 xx 02 + // + // 00000001 00000001 (01 01) + // \_______ | + // \| + // xxxxxxxx xxxxxx11 (xx 03) + uint32x4_t paired16 = + vreinterpretq_u32_u16(vsraq_n_u16(high_bits, high_bits, 7)); + + // Repeat with a wider 32-bit shift + add. + // xx 03 xx 01 xx 00 xx 02 + // \____ | \____ | paired32 = (uint64x1_t)(paired16 + (paired16 >> + // 14)) + // \| \| + // xx xx xx 0d xx xx xx 02 + // + // 00000011 00000001 (03 01) + // \\_____ || + // '----.\|| + // xxxxxxxx xxxx1101 (xx 0d) + uint64x2_t paired32 = + vreinterpretq_u64_u32(vsraq_n_u32(paired16, paired16, 14)); + + // Last, an even wider 64-bit shift + add to get our result in the low 8 bit + // lanes. xx xx xx 0d xx xx xx 02 + // \_________ | paired64 = (uint8x8_t)(paired32 + (paired32 >> + // 28)) + // \| + // xx xx xx xx xx xx xx d2 + // + // 00001101 00000010 (0d 02) + // \ \___ | | + // '---. \| | + // xxxxxxxx 11010010 (xx d2) + uint8x16_t paired64 = + vreinterpretq_u8_u64(vsraq_n_u64(paired32, paired32, 28)); + + // Extract the low 8 bits from each 64-bit lane with 2 8-bit extracts. + // xx xx xx xx xx xx xx d2 + // || return paired64[0] + // d2 + // Note: Little endian would return the correct value 4b (01001011) instead. + return vgetq_lane_u8(paired64, 0) | ((int) vgetq_lane_u8(paired64, 8) << 8); +} + +// Set each bit of mask dst based on the most significant bit of the +// corresponding packed double-precision (64-bit) floating-point element in a. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movemask_pd +FORCE_INLINE int _mm_movemask_pd(__m128d a) +{ + uint64x2_t input = vreinterpretq_u64_m128d(a); + uint64x2_t high_bits = vshrq_n_u64(input, 63); + return (int) (vgetq_lane_u64(high_bits, 0) | + (vgetq_lane_u64(high_bits, 1) << 1)); +} + +// Copy the lower 64-bit integer in a to dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movepi64_pi64 +FORCE_INLINE __m64 _mm_movepi64_pi64(__m128i a) +{ + return vreinterpret_m64_s64(vget_low_s64(vreinterpretq_s64_m128i(a))); +} + +// Copy the 64-bit integer a to the lower element of dst, and zero the upper +// element. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movpi64_epi64 +FORCE_INLINE __m128i _mm_movpi64_epi64(__m64 a) +{ + return vreinterpretq_m128i_s64( + vcombine_s64(vreinterpret_s64_m64(a), vdup_n_s64(0))); +} + +// Multiply the low unsigned 32-bit integers from each packed 64-bit element in +// a and b, and store the unsigned 64-bit results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mul_epu32 +FORCE_INLINE __m128i _mm_mul_epu32(__m128i a, __m128i b) +{ + // vmull_u32 upcasts instead of masking, so we downcast. + uint32x2_t a_lo = vmovn_u64(vreinterpretq_u64_m128i(a)); + uint32x2_t b_lo = vmovn_u64(vreinterpretq_u64_m128i(b)); + return vreinterpretq_m128i_u64(vmull_u32(a_lo, b_lo)); +} + +// Multiply packed double-precision (64-bit) floating-point elements in a and b, +// and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mul_pd +FORCE_INLINE __m128d _mm_mul_pd(__m128d a, __m128d b) +{ +#if defined(__aarch64__) || defined(_M_ARM64) + return vreinterpretq_m128d_f64( + vmulq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b))); +#else + double *da = (double *) &a; + double *db = (double *) &b; + double c[2]; + c[0] = da[0] * db[0]; + c[1] = da[1] * db[1]; + return vld1q_f32((float32_t *) c); +#endif +} + +// Multiply the lower double-precision (64-bit) floating-point element in a and +// b, store the result in the lower element of dst, and copy the upper element +// from a to the upper element of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_mul_sd +FORCE_INLINE __m128d _mm_mul_sd(__m128d a, __m128d b) +{ + return _mm_move_sd(a, _mm_mul_pd(a, b)); +} + +// Multiply the low unsigned 32-bit integers from a and b, and store the +// unsigned 64-bit result in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mul_su32 +FORCE_INLINE __m64 _mm_mul_su32(__m64 a, __m64 b) +{ + return vreinterpret_m64_u64(vget_low_u64( + vmull_u32(vreinterpret_u32_m64(a), vreinterpret_u32_m64(b)))); +} + +// Multiply the packed signed 16-bit integers in a and b, producing intermediate +// 32-bit integers, and store the high 16 bits of the intermediate integers in +// dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mulhi_epi16 +FORCE_INLINE __m128i _mm_mulhi_epi16(__m128i a, __m128i b) +{ + /* FIXME: issue with large values because of result saturation */ + // int16x8_t ret = vqdmulhq_s16(vreinterpretq_s16_m128i(a), + // vreinterpretq_s16_m128i(b)); /* =2*a*b */ return + // vreinterpretq_m128i_s16(vshrq_n_s16(ret, 1)); + int16x4_t a3210 = vget_low_s16(vreinterpretq_s16_m128i(a)); + int16x4_t b3210 = vget_low_s16(vreinterpretq_s16_m128i(b)); + int32x4_t ab3210 = vmull_s16(a3210, b3210); /* 3333222211110000 */ + int16x4_t a7654 = vget_high_s16(vreinterpretq_s16_m128i(a)); + int16x4_t b7654 = vget_high_s16(vreinterpretq_s16_m128i(b)); + int32x4_t ab7654 = vmull_s16(a7654, b7654); /* 7777666655554444 */ + uint16x8x2_t r = + vuzpq_u16(vreinterpretq_u16_s32(ab3210), vreinterpretq_u16_s32(ab7654)); + return vreinterpretq_m128i_u16(r.val[1]); +} + +// Multiply the packed unsigned 16-bit integers in a and b, producing +// intermediate 32-bit integers, and store the high 16 bits of the intermediate +// integers in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mulhi_epu16 +FORCE_INLINE __m128i _mm_mulhi_epu16(__m128i a, __m128i b) +{ + uint16x4_t a3210 = vget_low_u16(vreinterpretq_u16_m128i(a)); + uint16x4_t b3210 = vget_low_u16(vreinterpretq_u16_m128i(b)); + uint32x4_t ab3210 = vmull_u16(a3210, b3210); +#if defined(__aarch64__) || defined(_M_ARM64) + uint32x4_t ab7654 = + vmull_high_u16(vreinterpretq_u16_m128i(a), vreinterpretq_u16_m128i(b)); + uint16x8_t r = vuzp2q_u16(vreinterpretq_u16_u32(ab3210), + vreinterpretq_u16_u32(ab7654)); + return vreinterpretq_m128i_u16(r); +#else + uint16x4_t a7654 = vget_high_u16(vreinterpretq_u16_m128i(a)); + uint16x4_t b7654 = vget_high_u16(vreinterpretq_u16_m128i(b)); + uint32x4_t ab7654 = vmull_u16(a7654, b7654); + uint16x8x2_t r = + vuzpq_u16(vreinterpretq_u16_u32(ab3210), vreinterpretq_u16_u32(ab7654)); + return vreinterpretq_m128i_u16(r.val[1]); +#endif +} + +// Multiply the packed 16-bit integers in a and b, producing intermediate 32-bit +// integers, and store the low 16 bits of the intermediate integers in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mullo_epi16 +FORCE_INLINE __m128i _mm_mullo_epi16(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_s16( + vmulq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b))); +} + +// Compute the bitwise OR of packed double-precision (64-bit) floating-point +// elements in a and b, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_or_pd +FORCE_INLINE __m128d _mm_or_pd(__m128d a, __m128d b) +{ + return vreinterpretq_m128d_s64( + vorrq_s64(vreinterpretq_s64_m128d(a), vreinterpretq_s64_m128d(b))); +} + +// Compute the bitwise OR of 128 bits (representing integer data) in a and b, +// and store the result in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_or_si128 +FORCE_INLINE __m128i _mm_or_si128(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_s32( + vorrq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b))); +} + +// Convert packed signed 16-bit integers from a and b to packed 8-bit integers +// using signed saturation, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_packs_epi16 +FORCE_INLINE __m128i _mm_packs_epi16(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_s8( + vcombine_s8(vqmovn_s16(vreinterpretq_s16_m128i(a)), + vqmovn_s16(vreinterpretq_s16_m128i(b)))); +} + +// Convert packed signed 32-bit integers from a and b to packed 16-bit integers +// using signed saturation, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_packs_epi32 +FORCE_INLINE __m128i _mm_packs_epi32(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_s16( + vcombine_s16(vqmovn_s32(vreinterpretq_s32_m128i(a)), + vqmovn_s32(vreinterpretq_s32_m128i(b)))); +} + +// Convert packed signed 16-bit integers from a and b to packed 8-bit integers +// using unsigned saturation, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_packus_epi16 +FORCE_INLINE __m128i _mm_packus_epi16(const __m128i a, const __m128i b) +{ + return vreinterpretq_m128i_u8( + vcombine_u8(vqmovun_s16(vreinterpretq_s16_m128i(a)), + vqmovun_s16(vreinterpretq_s16_m128i(b)))); +} + +// Pause the processor. This is typically used in spin-wait loops and depending +// on the x86 processor typical values are in the 40-100 cycle range. The +// 'yield' instruction isn't a good fit because it's effectively a nop on most +// Arm cores. Experience with several databases has shown has shown an 'isb' is +// a reasonable approximation. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_pause +FORCE_INLINE void _mm_pause(void) +{ +#if defined(_MSC_VER) + __isb(_ARM64_BARRIER_SY); +#else + __asm__ __volatile__("isb\n"); +#endif +} + +// Compute the absolute differences of packed unsigned 8-bit integers in a and +// b, then horizontally sum each consecutive 8 differences to produce two +// unsigned 16-bit integers, and pack these unsigned 16-bit integers in the low +// 16 bits of 64-bit elements in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sad_epu8 +FORCE_INLINE __m128i _mm_sad_epu8(__m128i a, __m128i b) +{ + uint16x8_t t = vpaddlq_u8(vabdq_u8((uint8x16_t) a, (uint8x16_t) b)); + return vreinterpretq_m128i_u64(vpaddlq_u32(vpaddlq_u16(t))); +} + +// Set packed 16-bit integers in dst with the supplied values. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_epi16 +FORCE_INLINE __m128i _mm_set_epi16(short i7, + short i6, + short i5, + short i4, + short i3, + short i2, + short i1, + short i0) +{ + int16_t ALIGN_STRUCT(16) data[8] = {i0, i1, i2, i3, i4, i5, i6, i7}; + return vreinterpretq_m128i_s16(vld1q_s16(data)); +} + +// Set packed 32-bit integers in dst with the supplied values. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_epi32 +FORCE_INLINE __m128i _mm_set_epi32(int i3, int i2, int i1, int i0) +{ + int32_t ALIGN_STRUCT(16) data[4] = {i0, i1, i2, i3}; + return vreinterpretq_m128i_s32(vld1q_s32(data)); +} + +// Set packed 64-bit integers in dst with the supplied values. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_epi64 +FORCE_INLINE __m128i _mm_set_epi64(__m64 i1, __m64 i2) +{ + return _mm_set_epi64x(vget_lane_s64(i1, 0), vget_lane_s64(i2, 0)); +} + +// Set packed 64-bit integers in dst with the supplied values. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_epi64x +FORCE_INLINE __m128i _mm_set_epi64x(int64_t i1, int64_t i2) +{ + return vreinterpretq_m128i_s64( + vcombine_s64(vcreate_s64(i2), vcreate_s64(i1))); +} + +// Set packed 8-bit integers in dst with the supplied values. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_epi8 +FORCE_INLINE __m128i _mm_set_epi8(signed char b15, + signed char b14, + signed char b13, + signed char b12, + signed char b11, + signed char b10, + signed char b9, + signed char b8, + signed char b7, + signed char b6, + signed char b5, + signed char b4, + signed char b3, + signed char b2, + signed char b1, + signed char b0) +{ + int8_t ALIGN_STRUCT(16) + data[16] = {(int8_t) b0, (int8_t) b1, (int8_t) b2, (int8_t) b3, + (int8_t) b4, (int8_t) b5, (int8_t) b6, (int8_t) b7, + (int8_t) b8, (int8_t) b9, (int8_t) b10, (int8_t) b11, + (int8_t) b12, (int8_t) b13, (int8_t) b14, (int8_t) b15}; + return (__m128i) vld1q_s8(data); +} + +// Set packed double-precision (64-bit) floating-point elements in dst with the +// supplied values. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_pd +FORCE_INLINE __m128d _mm_set_pd(double e1, double e0) +{ + double ALIGN_STRUCT(16) data[2] = {e0, e1}; +#if defined(__aarch64__) || defined(_M_ARM64) + return vreinterpretq_m128d_f64(vld1q_f64((float64_t *) data)); +#else + return vreinterpretq_m128d_f32(vld1q_f32((float32_t *) data)); +#endif +} + +// Broadcast double-precision (64-bit) floating-point value a to all elements of +// dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_pd1 +#define _mm_set_pd1 _mm_set1_pd + +// Copy double-precision (64-bit) floating-point element a to the lower element +// of dst, and zero the upper element. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_sd +FORCE_INLINE __m128d _mm_set_sd(double a) +{ +#if defined(__aarch64__) || defined(_M_ARM64) + return vreinterpretq_m128d_f64(vsetq_lane_f64(a, vdupq_n_f64(0), 0)); +#else + return _mm_set_pd(0, a); +#endif +} + +// Broadcast 16-bit integer a to all elements of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set1_epi16 +FORCE_INLINE __m128i _mm_set1_epi16(short w) +{ + return vreinterpretq_m128i_s16(vdupq_n_s16(w)); +} + +// Broadcast 32-bit integer a to all elements of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set1_epi32 +FORCE_INLINE __m128i _mm_set1_epi32(int _i) +{ + return vreinterpretq_m128i_s32(vdupq_n_s32(_i)); +} + +// Broadcast 64-bit integer a to all elements of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set1_epi64 +FORCE_INLINE __m128i _mm_set1_epi64(__m64 _i) +{ + return vreinterpretq_m128i_s64(vdupq_lane_s64(_i, 0)); +} + +// Broadcast 64-bit integer a to all elements of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set1_epi64x +FORCE_INLINE __m128i _mm_set1_epi64x(int64_t _i) +{ + return vreinterpretq_m128i_s64(vdupq_n_s64(_i)); +} + +// Broadcast 8-bit integer a to all elements of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set1_epi8 +FORCE_INLINE __m128i _mm_set1_epi8(signed char w) +{ + return vreinterpretq_m128i_s8(vdupq_n_s8(w)); +} + +// Broadcast double-precision (64-bit) floating-point value a to all elements of +// dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set1_pd +FORCE_INLINE __m128d _mm_set1_pd(double d) +{ +#if defined(__aarch64__) || defined(_M_ARM64) + return vreinterpretq_m128d_f64(vdupq_n_f64(d)); +#else + return vreinterpretq_m128d_s64(vdupq_n_s64(*(int64_t *) &d)); +#endif +} + +// Set packed 16-bit integers in dst with the supplied values in reverse order. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setr_epi16 +FORCE_INLINE __m128i _mm_setr_epi16(short w0, + short w1, + short w2, + short w3, + short w4, + short w5, + short w6, + short w7) +{ + int16_t ALIGN_STRUCT(16) data[8] = {w0, w1, w2, w3, w4, w5, w6, w7}; + return vreinterpretq_m128i_s16(vld1q_s16((int16_t *) data)); +} + +// Set packed 32-bit integers in dst with the supplied values in reverse order. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setr_epi32 +FORCE_INLINE __m128i _mm_setr_epi32(int i3, int i2, int i1, int i0) +{ + int32_t ALIGN_STRUCT(16) data[4] = {i3, i2, i1, i0}; + return vreinterpretq_m128i_s32(vld1q_s32(data)); +} + +// Set packed 64-bit integers in dst with the supplied values in reverse order. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setr_epi64 +FORCE_INLINE __m128i _mm_setr_epi64(__m64 e1, __m64 e0) +{ + return vreinterpretq_m128i_s64(vcombine_s64(e1, e0)); +} + +// Set packed 8-bit integers in dst with the supplied values in reverse order. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setr_epi8 +FORCE_INLINE __m128i _mm_setr_epi8(signed char b0, + signed char b1, + signed char b2, + signed char b3, + signed char b4, + signed char b5, + signed char b6, + signed char b7, + signed char b8, + signed char b9, + signed char b10, + signed char b11, + signed char b12, + signed char b13, + signed char b14, + signed char b15) +{ + int8_t ALIGN_STRUCT(16) + data[16] = {(int8_t) b0, (int8_t) b1, (int8_t) b2, (int8_t) b3, + (int8_t) b4, (int8_t) b5, (int8_t) b6, (int8_t) b7, + (int8_t) b8, (int8_t) b9, (int8_t) b10, (int8_t) b11, + (int8_t) b12, (int8_t) b13, (int8_t) b14, (int8_t) b15}; + return (__m128i) vld1q_s8(data); +} + +// Set packed double-precision (64-bit) floating-point elements in dst with the +// supplied values in reverse order. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setr_pd +FORCE_INLINE __m128d _mm_setr_pd(double e1, double e0) +{ + return _mm_set_pd(e0, e1); +} + +// Return vector of type __m128d with all elements set to zero. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setzero_pd +FORCE_INLINE __m128d _mm_setzero_pd(void) +{ +#if defined(__aarch64__) || defined(_M_ARM64) + return vreinterpretq_m128d_f64(vdupq_n_f64(0)); +#else + return vreinterpretq_m128d_f32(vdupq_n_f32(0)); +#endif +} + +// Return vector of type __m128i with all elements set to zero. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setzero_si128 +FORCE_INLINE __m128i _mm_setzero_si128(void) +{ + return vreinterpretq_m128i_s32(vdupq_n_s32(0)); +} + +// Shuffle 32-bit integers in a using the control in imm8, and store the results +// in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shuffle_epi32 +// FORCE_INLINE __m128i _mm_shuffle_epi32(__m128i a, +// __constrange(0,255) int imm) +#if defined(_sse2neon_shuffle) +#define _mm_shuffle_epi32(a, imm) \ + __extension__({ \ + int32x4_t _input = vreinterpretq_s32_m128i(a); \ + int32x4_t _shuf = \ + vshuffleq_s32(_input, _input, (imm) & (0x3), ((imm) >> 2) & 0x3, \ + ((imm) >> 4) & 0x3, ((imm) >> 6) & 0x3); \ + vreinterpretq_m128i_s32(_shuf); \ + }) +#else // generic +#define _mm_shuffle_epi32(a, imm) \ + _sse2neon_define1( \ + __m128i, a, __m128i ret; switch (imm) { \ + case _MM_SHUFFLE(1, 0, 3, 2): \ + ret = _mm_shuffle_epi_1032(_a); \ + break; \ + case _MM_SHUFFLE(2, 3, 0, 1): \ + ret = _mm_shuffle_epi_2301(_a); \ + break; \ + case _MM_SHUFFLE(0, 3, 2, 1): \ + ret = _mm_shuffle_epi_0321(_a); \ + break; \ + case _MM_SHUFFLE(2, 1, 0, 3): \ + ret = _mm_shuffle_epi_2103(_a); \ + break; \ + case _MM_SHUFFLE(1, 0, 1, 0): \ + ret = _mm_shuffle_epi_1010(_a); \ + break; \ + case _MM_SHUFFLE(1, 0, 0, 1): \ + ret = _mm_shuffle_epi_1001(_a); \ + break; \ + case _MM_SHUFFLE(0, 1, 0, 1): \ + ret = _mm_shuffle_epi_0101(_a); \ + break; \ + case _MM_SHUFFLE(2, 2, 1, 1): \ + ret = _mm_shuffle_epi_2211(_a); \ + break; \ + case _MM_SHUFFLE(0, 1, 2, 2): \ + ret = _mm_shuffle_epi_0122(_a); \ + break; \ + case _MM_SHUFFLE(3, 3, 3, 2): \ + ret = _mm_shuffle_epi_3332(_a); \ + break; \ + case _MM_SHUFFLE(0, 0, 0, 0): \ + ret = _mm_shuffle_epi32_splat(_a, 0); \ + break; \ + case _MM_SHUFFLE(1, 1, 1, 1): \ + ret = _mm_shuffle_epi32_splat(_a, 1); \ + break; \ + case _MM_SHUFFLE(2, 2, 2, 2): \ + ret = _mm_shuffle_epi32_splat(_a, 2); \ + break; \ + case _MM_SHUFFLE(3, 3, 3, 3): \ + ret = _mm_shuffle_epi32_splat(_a, 3); \ + break; \ + default: \ + ret = _mm_shuffle_epi32_default(_a, (imm)); \ + break; \ + } _sse2neon_return(ret);) +#endif + +// Shuffle double-precision (64-bit) floating-point elements using the control +// in imm8, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shuffle_pd +#ifdef _sse2neon_shuffle +#define _mm_shuffle_pd(a, b, imm8) \ + vreinterpretq_m128d_s64( \ + vshuffleq_s64(vreinterpretq_s64_m128d(a), vreinterpretq_s64_m128d(b), \ + imm8 & 0x1, ((imm8 & 0x2) >> 1) + 2)) +#else +#define _mm_shuffle_pd(a, b, imm8) \ + _mm_castsi128_pd(_mm_set_epi64x( \ + vgetq_lane_s64(vreinterpretq_s64_m128d(b), (imm8 & 0x2) >> 1), \ + vgetq_lane_s64(vreinterpretq_s64_m128d(a), imm8 & 0x1))) +#endif + +// FORCE_INLINE __m128i _mm_shufflehi_epi16(__m128i a, +// __constrange(0,255) int imm) +#if defined(_sse2neon_shuffle) +#define _mm_shufflehi_epi16(a, imm) \ + __extension__({ \ + int16x8_t _input = vreinterpretq_s16_m128i(a); \ + int16x8_t _shuf = \ + vshuffleq_s16(_input, _input, 0, 1, 2, 3, ((imm) & (0x3)) + 4, \ + (((imm) >> 2) & 0x3) + 4, (((imm) >> 4) & 0x3) + 4, \ + (((imm) >> 6) & 0x3) + 4); \ + vreinterpretq_m128i_s16(_shuf); \ + }) +#else // generic +#define _mm_shufflehi_epi16(a, imm) _mm_shufflehi_epi16_function((a), (imm)) +#endif + +// FORCE_INLINE __m128i _mm_shufflelo_epi16(__m128i a, +// __constrange(0,255) int imm) +#if defined(_sse2neon_shuffle) +#define _mm_shufflelo_epi16(a, imm) \ + __extension__({ \ + int16x8_t _input = vreinterpretq_s16_m128i(a); \ + int16x8_t _shuf = vshuffleq_s16( \ + _input, _input, ((imm) & (0x3)), (((imm) >> 2) & 0x3), \ + (((imm) >> 4) & 0x3), (((imm) >> 6) & 0x3), 4, 5, 6, 7); \ + vreinterpretq_m128i_s16(_shuf); \ + }) +#else // generic +#define _mm_shufflelo_epi16(a, imm) _mm_shufflelo_epi16_function((a), (imm)) +#endif + +// Shift packed 16-bit integers in a left by count while shifting in zeros, and +// store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sll_epi16 +FORCE_INLINE __m128i _mm_sll_epi16(__m128i a, __m128i count) +{ + uint64_t c = vreinterpretq_nth_u64_m128i(count, 0); + if (_sse2neon_unlikely(c & ~15)) + return _mm_setzero_si128(); + + int16x8_t vc = vdupq_n_s16((int16_t) c); + return vreinterpretq_m128i_s16(vshlq_s16(vreinterpretq_s16_m128i(a), vc)); +} + +// Shift packed 32-bit integers in a left by count while shifting in zeros, and +// store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sll_epi32 +FORCE_INLINE __m128i _mm_sll_epi32(__m128i a, __m128i count) +{ + uint64_t c = vreinterpretq_nth_u64_m128i(count, 0); + if (_sse2neon_unlikely(c & ~31)) + return _mm_setzero_si128(); + + int32x4_t vc = vdupq_n_s32((int32_t) c); + return vreinterpretq_m128i_s32(vshlq_s32(vreinterpretq_s32_m128i(a), vc)); +} + +// Shift packed 64-bit integers in a left by count while shifting in zeros, and +// store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sll_epi64 +FORCE_INLINE __m128i _mm_sll_epi64(__m128i a, __m128i count) +{ + uint64_t c = vreinterpretq_nth_u64_m128i(count, 0); + if (_sse2neon_unlikely(c & ~63)) + return _mm_setzero_si128(); + + int64x2_t vc = vdupq_n_s64((int64_t) c); + return vreinterpretq_m128i_s64(vshlq_s64(vreinterpretq_s64_m128i(a), vc)); +} + +// Shift packed 16-bit integers in a left by imm8 while shifting in zeros, and +// store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_slli_epi16 +FORCE_INLINE __m128i _mm_slli_epi16(__m128i a, int imm) +{ + if (_sse2neon_unlikely(imm & ~15)) + return _mm_setzero_si128(); + return vreinterpretq_m128i_s16( + vshlq_s16(vreinterpretq_s16_m128i(a), vdupq_n_s16(imm))); +} + +// Shift packed 32-bit integers in a left by imm8 while shifting in zeros, and +// store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_slli_epi32 +FORCE_INLINE __m128i _mm_slli_epi32(__m128i a, int imm) +{ + if (_sse2neon_unlikely(imm & ~31)) + return _mm_setzero_si128(); + return vreinterpretq_m128i_s32( + vshlq_s32(vreinterpretq_s32_m128i(a), vdupq_n_s32(imm))); +} + +// Shift packed 64-bit integers in a left by imm8 while shifting in zeros, and +// store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_slli_epi64 +FORCE_INLINE __m128i _mm_slli_epi64(__m128i a, int imm) +{ + if (_sse2neon_unlikely(imm & ~63)) + return _mm_setzero_si128(); + return vreinterpretq_m128i_s64( + vshlq_s64(vreinterpretq_s64_m128i(a), vdupq_n_s64(imm))); +} + +// Shift a left by imm8 bytes while shifting in zeros, and store the results in +// dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_slli_si128 +#define _mm_slli_si128(a, imm) \ + _sse2neon_define1( \ + __m128i, a, int8x16_t ret; \ + if (_sse2neon_unlikely(imm == 0)) ret = vreinterpretq_s8_m128i(_a); \ + else if (_sse2neon_unlikely((imm) & ~15)) ret = vdupq_n_s8(0); \ + else ret = vextq_s8(vdupq_n_s8(0), vreinterpretq_s8_m128i(_a), \ + ((imm <= 0 || imm > 15) ? 0 : (16 - imm))); \ + _sse2neon_return(vreinterpretq_m128i_s8(ret));) + +// Compute the square root of packed double-precision (64-bit) floating-point +// elements in a, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sqrt_pd +FORCE_INLINE __m128d _mm_sqrt_pd(__m128d a) +{ +#if defined(__aarch64__) || defined(_M_ARM64) + return vreinterpretq_m128d_f64(vsqrtq_f64(vreinterpretq_f64_m128d(a))); +#else + double a0 = sqrt(((double *) &a)[0]); + double a1 = sqrt(((double *) &a)[1]); + return _mm_set_pd(a1, a0); +#endif +} + +// Compute the square root of the lower double-precision (64-bit) floating-point +// element in b, store the result in the lower element of dst, and copy the +// upper element from a to the upper element of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sqrt_sd +FORCE_INLINE __m128d _mm_sqrt_sd(__m128d a, __m128d b) +{ +#if defined(__aarch64__) || defined(_M_ARM64) + return _mm_move_sd(a, _mm_sqrt_pd(b)); +#else + return _mm_set_pd(((double *) &a)[1], sqrt(((double *) &b)[0])); +#endif +} + +// Shift packed 16-bit integers in a right by count while shifting in sign bits, +// and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sra_epi16 +FORCE_INLINE __m128i _mm_sra_epi16(__m128i a, __m128i count) +{ + int64_t c = vgetq_lane_s64(count, 0); + if (_sse2neon_unlikely(c & ~15)) + return _mm_cmplt_epi16(a, _mm_setzero_si128()); + return vreinterpretq_m128i_s16( + vshlq_s16((int16x8_t) a, vdupq_n_s16((int) -c))); +} + +// Shift packed 32-bit integers in a right by count while shifting in sign bits, +// and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sra_epi32 +FORCE_INLINE __m128i _mm_sra_epi32(__m128i a, __m128i count) +{ + int64_t c = vgetq_lane_s64(count, 0); + if (_sse2neon_unlikely(c & ~31)) + return _mm_cmplt_epi32(a, _mm_setzero_si128()); + return vreinterpretq_m128i_s32( + vshlq_s32((int32x4_t) a, vdupq_n_s32((int) -c))); +} + +// Shift packed 16-bit integers in a right by imm8 while shifting in sign +// bits, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srai_epi16 +FORCE_INLINE __m128i _mm_srai_epi16(__m128i a, int imm) +{ + const int count = (imm & ~15) ? 15 : imm; + return (__m128i) vshlq_s16((int16x8_t) a, vdupq_n_s16(-count)); +} + +// Shift packed 32-bit integers in a right by imm8 while shifting in sign bits, +// and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srai_epi32 +// FORCE_INLINE __m128i _mm_srai_epi32(__m128i a, __constrange(0,255) int imm) +#define _mm_srai_epi32(a, imm) \ + _sse2neon_define0( \ + __m128i, a, __m128i ret; if (_sse2neon_unlikely((imm) == 0)) { \ + ret = _a; \ + } else if (_sse2neon_likely(0 < (imm) && (imm) < 32)) { \ + ret = vreinterpretq_m128i_s32( \ + vshlq_s32(vreinterpretq_s32_m128i(_a), vdupq_n_s32(-(imm)))); \ + } else { \ + ret = vreinterpretq_m128i_s32( \ + vshrq_n_s32(vreinterpretq_s32_m128i(_a), 31)); \ + } _sse2neon_return(ret);) + +// Shift packed 16-bit integers in a right by count while shifting in zeros, and +// store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srl_epi16 +FORCE_INLINE __m128i _mm_srl_epi16(__m128i a, __m128i count) +{ + uint64_t c = vreinterpretq_nth_u64_m128i(count, 0); + if (_sse2neon_unlikely(c & ~15)) + return _mm_setzero_si128(); + + int16x8_t vc = vdupq_n_s16(-(int16_t) c); + return vreinterpretq_m128i_u16(vshlq_u16(vreinterpretq_u16_m128i(a), vc)); +} + +// Shift packed 32-bit integers in a right by count while shifting in zeros, and +// store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srl_epi32 +FORCE_INLINE __m128i _mm_srl_epi32(__m128i a, __m128i count) +{ + uint64_t c = vreinterpretq_nth_u64_m128i(count, 0); + if (_sse2neon_unlikely(c & ~31)) + return _mm_setzero_si128(); + + int32x4_t vc = vdupq_n_s32(-(int32_t) c); + return vreinterpretq_m128i_u32(vshlq_u32(vreinterpretq_u32_m128i(a), vc)); +} + +// Shift packed 64-bit integers in a right by count while shifting in zeros, and +// store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srl_epi64 +FORCE_INLINE __m128i _mm_srl_epi64(__m128i a, __m128i count) +{ + uint64_t c = vreinterpretq_nth_u64_m128i(count, 0); + if (_sse2neon_unlikely(c & ~63)) + return _mm_setzero_si128(); + + int64x2_t vc = vdupq_n_s64(-(int64_t) c); + return vreinterpretq_m128i_u64(vshlq_u64(vreinterpretq_u64_m128i(a), vc)); +} + +// Shift packed 16-bit integers in a right by imm8 while shifting in zeros, and +// store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srli_epi16 +#define _mm_srli_epi16(a, imm) \ + _sse2neon_define0( \ + __m128i, a, __m128i ret; if (_sse2neon_unlikely((imm) & ~15)) { \ + ret = _mm_setzero_si128(); \ + } else { \ + ret = vreinterpretq_m128i_u16( \ + vshlq_u16(vreinterpretq_u16_m128i(_a), vdupq_n_s16(-(imm)))); \ + } _sse2neon_return(ret);) + +// Shift packed 32-bit integers in a right by imm8 while shifting in zeros, and +// store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srli_epi32 +// FORCE_INLINE __m128i _mm_srli_epi32(__m128i a, __constrange(0,255) int imm) +#define _mm_srli_epi32(a, imm) \ + _sse2neon_define0( \ + __m128i, a, __m128i ret; if (_sse2neon_unlikely((imm) & ~31)) { \ + ret = _mm_setzero_si128(); \ + } else { \ + ret = vreinterpretq_m128i_u32( \ + vshlq_u32(vreinterpretq_u32_m128i(_a), vdupq_n_s32(-(imm)))); \ + } _sse2neon_return(ret);) + +// Shift packed 64-bit integers in a right by imm8 while shifting in zeros, and +// store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srli_epi64 +#define _mm_srli_epi64(a, imm) \ + _sse2neon_define0( \ + __m128i, a, __m128i ret; if (_sse2neon_unlikely((imm) & ~63)) { \ + ret = _mm_setzero_si128(); \ + } else { \ + ret = vreinterpretq_m128i_u64( \ + vshlq_u64(vreinterpretq_u64_m128i(_a), vdupq_n_s64(-(imm)))); \ + } _sse2neon_return(ret);) + +// Shift a right by imm8 bytes while shifting in zeros, and store the results in +// dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srli_si128 +#define _mm_srli_si128(a, imm) \ + _sse2neon_define1( \ + __m128i, a, int8x16_t ret; \ + if (_sse2neon_unlikely((imm) & ~15)) ret = vdupq_n_s8(0); \ + else ret = vextq_s8(vreinterpretq_s8_m128i(_a), vdupq_n_s8(0), \ + (imm > 15 ? 0 : imm)); \ + _sse2neon_return(vreinterpretq_m128i_s8(ret));) + +// Store 128-bits (composed of 2 packed double-precision (64-bit) floating-point +// elements) from a into memory. mem_addr must be aligned on a 16-byte boundary +// or a general-protection exception may be generated. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_store_pd +FORCE_INLINE void _mm_store_pd(double *mem_addr, __m128d a) +{ +#if defined(__aarch64__) || defined(_M_ARM64) + vst1q_f64((float64_t *) mem_addr, vreinterpretq_f64_m128d(a)); +#else + vst1q_f32((float32_t *) mem_addr, vreinterpretq_f32_m128d(a)); +#endif +} + +// Store the lower double-precision (64-bit) floating-point element from a into +// 2 contiguous elements in memory. mem_addr must be aligned on a 16-byte +// boundary or a general-protection exception may be generated. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_store_pd1 +FORCE_INLINE void _mm_store_pd1(double *mem_addr, __m128d a) +{ +#if defined(__aarch64__) || defined(_M_ARM64) + float64x1_t a_low = vget_low_f64(vreinterpretq_f64_m128d(a)); + vst1q_f64((float64_t *) mem_addr, + vreinterpretq_f64_m128d(vcombine_f64(a_low, a_low))); +#else + float32x2_t a_low = vget_low_f32(vreinterpretq_f32_m128d(a)); + vst1q_f32((float32_t *) mem_addr, + vreinterpretq_f32_m128d(vcombine_f32(a_low, a_low))); +#endif +} + +// Store the lower double-precision (64-bit) floating-point element from a into +// memory. mem_addr does not need to be aligned on any particular boundary. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_store_sd +FORCE_INLINE void _mm_store_sd(double *mem_addr, __m128d a) +{ +#if defined(__aarch64__) || defined(_M_ARM64) + vst1_f64((float64_t *) mem_addr, vget_low_f64(vreinterpretq_f64_m128d(a))); +#else + vst1_u64((uint64_t *) mem_addr, vget_low_u64(vreinterpretq_u64_m128d(a))); +#endif +} + +// Store 128-bits of integer data from a into memory. mem_addr must be aligned +// on a 16-byte boundary or a general-protection exception may be generated. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_store_si128 +FORCE_INLINE void _mm_store_si128(__m128i *p, __m128i a) +{ + vst1q_s32((int32_t *) p, vreinterpretq_s32_m128i(a)); +} + +// Store the lower double-precision (64-bit) floating-point element from a into +// 2 contiguous elements in memory. mem_addr must be aligned on a 16-byte +// boundary or a general-protection exception may be generated. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#expand=9,526,5601&text=_mm_store1_pd +#define _mm_store1_pd _mm_store_pd1 + +// Store the upper double-precision (64-bit) floating-point element from a into +// memory. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storeh_pd +FORCE_INLINE void _mm_storeh_pd(double *mem_addr, __m128d a) +{ +#if defined(__aarch64__) || defined(_M_ARM64) + vst1_f64((float64_t *) mem_addr, vget_high_f64(vreinterpretq_f64_m128d(a))); +#else + vst1_f32((float32_t *) mem_addr, vget_high_f32(vreinterpretq_f32_m128d(a))); +#endif +} + +// Store 64-bit integer from the first element of a into memory. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storel_epi64 +FORCE_INLINE void _mm_storel_epi64(__m128i *a, __m128i b) +{ + vst1_u64((uint64_t *) a, vget_low_u64(vreinterpretq_u64_m128i(b))); +} + +// Store the lower double-precision (64-bit) floating-point element from a into +// memory. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storel_pd +FORCE_INLINE void _mm_storel_pd(double *mem_addr, __m128d a) +{ +#if defined(__aarch64__) || defined(_M_ARM64) + vst1_f64((float64_t *) mem_addr, vget_low_f64(vreinterpretq_f64_m128d(a))); +#else + vst1_f32((float32_t *) mem_addr, vget_low_f32(vreinterpretq_f32_m128d(a))); +#endif +} + +// Store 2 double-precision (64-bit) floating-point elements from a into memory +// in reverse order. mem_addr must be aligned on a 16-byte boundary or a +// general-protection exception may be generated. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storer_pd +FORCE_INLINE void _mm_storer_pd(double *mem_addr, __m128d a) +{ + float32x4_t f = vreinterpretq_f32_m128d(a); + _mm_store_pd(mem_addr, vreinterpretq_m128d_f32(vextq_f32(f, f, 2))); +} + +// Store 128-bits (composed of 2 packed double-precision (64-bit) floating-point +// elements) from a into memory. mem_addr does not need to be aligned on any +// particular boundary. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storeu_pd +FORCE_INLINE void _mm_storeu_pd(double *mem_addr, __m128d a) +{ + _mm_store_pd(mem_addr, a); +} + +// Store 128-bits of integer data from a into memory. mem_addr does not need to +// be aligned on any particular boundary. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storeu_si128 +FORCE_INLINE void _mm_storeu_si128(__m128i *p, __m128i a) +{ + vst1q_s32((int32_t *) p, vreinterpretq_s32_m128i(a)); +} + +// Store 32-bit integer from the first element of a into memory. mem_addr does +// not need to be aligned on any particular boundary. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storeu_si32 +FORCE_INLINE void _mm_storeu_si32(void *p, __m128i a) +{ + vst1q_lane_s32((int32_t *) p, vreinterpretq_s32_m128i(a), 0); +} + +// Store 128-bits (composed of 2 packed double-precision (64-bit) floating-point +// elements) from a into memory using a non-temporal memory hint. mem_addr must +// be aligned on a 16-byte boundary or a general-protection exception may be +// generated. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_stream_pd +FORCE_INLINE void _mm_stream_pd(double *p, __m128d a) +{ +#if __has_builtin(__builtin_nontemporal_store) + __builtin_nontemporal_store(a, (__m128d *) p); +#elif defined(__aarch64__) || defined(_M_ARM64) + vst1q_f64(p, vreinterpretq_f64_m128d(a)); +#else + vst1q_s64((int64_t *) p, vreinterpretq_s64_m128d(a)); +#endif +} + +// Store 128-bits of integer data from a into memory using a non-temporal memory +// hint. mem_addr must be aligned on a 16-byte boundary or a general-protection +// exception may be generated. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_stream_si128 +FORCE_INLINE void _mm_stream_si128(__m128i *p, __m128i a) +{ +#if __has_builtin(__builtin_nontemporal_store) + __builtin_nontemporal_store(a, p); +#else + vst1q_s64((int64_t *) p, vreinterpretq_s64_m128i(a)); +#endif +} + +// Store 32-bit integer a into memory using a non-temporal hint to minimize +// cache pollution. If the cache line containing address mem_addr is already in +// the cache, the cache will be updated. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_stream_si32 +FORCE_INLINE void _mm_stream_si32(int *p, int a) +{ + vst1q_lane_s32((int32_t *) p, vdupq_n_s32(a), 0); +} + +// Store 64-bit integer a into memory using a non-temporal hint to minimize +// cache pollution. If the cache line containing address mem_addr is already in +// the cache, the cache will be updated. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_stream_si64 +FORCE_INLINE void _mm_stream_si64(__int64 *p, __int64 a) +{ + vst1_s64((int64_t *) p, vdup_n_s64((int64_t) a)); +} + +// Subtract packed 16-bit integers in b from packed 16-bit integers in a, and +// store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sub_epi16 +FORCE_INLINE __m128i _mm_sub_epi16(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_s16( + vsubq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b))); +} + +// Subtract packed 32-bit integers in b from packed 32-bit integers in a, and +// store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sub_epi32 +FORCE_INLINE __m128i _mm_sub_epi32(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_s32( + vsubq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b))); +} + +// Subtract packed 64-bit integers in b from packed 64-bit integers in a, and +// store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sub_epi64 +FORCE_INLINE __m128i _mm_sub_epi64(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_s64( + vsubq_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(b))); +} + +// Subtract packed 8-bit integers in b from packed 8-bit integers in a, and +// store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sub_epi8 +FORCE_INLINE __m128i _mm_sub_epi8(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_s8( + vsubq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b))); +} + +// Subtract packed double-precision (64-bit) floating-point elements in b from +// packed double-precision (64-bit) floating-point elements in a, and store the +// results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_sub_pd +FORCE_INLINE __m128d _mm_sub_pd(__m128d a, __m128d b) +{ +#if defined(__aarch64__) || defined(_M_ARM64) + return vreinterpretq_m128d_f64( + vsubq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b))); +#else + double *da = (double *) &a; + double *db = (double *) &b; + double c[2]; + c[0] = da[0] - db[0]; + c[1] = da[1] - db[1]; + return vld1q_f32((float32_t *) c); +#endif +} + +// Subtract the lower double-precision (64-bit) floating-point element in b from +// the lower double-precision (64-bit) floating-point element in a, store the +// result in the lower element of dst, and copy the upper element from a to the +// upper element of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sub_sd +FORCE_INLINE __m128d _mm_sub_sd(__m128d a, __m128d b) +{ + return _mm_move_sd(a, _mm_sub_pd(a, b)); +} + +// Subtract 64-bit integer b from 64-bit integer a, and store the result in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sub_si64 +FORCE_INLINE __m64 _mm_sub_si64(__m64 a, __m64 b) +{ + return vreinterpret_m64_s64( + vsub_s64(vreinterpret_s64_m64(a), vreinterpret_s64_m64(b))); +} + +// Subtract packed signed 16-bit integers in b from packed 16-bit integers in a +// using saturation, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_subs_epi16 +FORCE_INLINE __m128i _mm_subs_epi16(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_s16( + vqsubq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b))); +} + +// Subtract packed signed 8-bit integers in b from packed 8-bit integers in a +// using saturation, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_subs_epi8 +FORCE_INLINE __m128i _mm_subs_epi8(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_s8( + vqsubq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b))); +} + +// Subtract packed unsigned 16-bit integers in b from packed unsigned 16-bit +// integers in a using saturation, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_subs_epu16 +FORCE_INLINE __m128i _mm_subs_epu16(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_u16( + vqsubq_u16(vreinterpretq_u16_m128i(a), vreinterpretq_u16_m128i(b))); +} + +// Subtract packed unsigned 8-bit integers in b from packed unsigned 8-bit +// integers in a using saturation, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_subs_epu8 +FORCE_INLINE __m128i _mm_subs_epu8(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_u8( + vqsubq_u8(vreinterpretq_u8_m128i(a), vreinterpretq_u8_m128i(b))); +} + +#define _mm_ucomieq_sd _mm_comieq_sd +#define _mm_ucomige_sd _mm_comige_sd +#define _mm_ucomigt_sd _mm_comigt_sd +#define _mm_ucomile_sd _mm_comile_sd +#define _mm_ucomilt_sd _mm_comilt_sd +#define _mm_ucomineq_sd _mm_comineq_sd + +// Return vector of type __m128d with undefined elements. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_undefined_pd +FORCE_INLINE __m128d _mm_undefined_pd(void) +{ +#if defined(__GNUC__) || defined(__clang__) +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wuninitialized" +#endif + __m128d a; +#if defined(_MSC_VER) + a = _mm_setzero_pd(); +#endif + return a; +#if defined(__GNUC__) || defined(__clang__) +#pragma GCC diagnostic pop +#endif +} + +// Unpack and interleave 16-bit integers from the high half of a and b, and +// store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpackhi_epi16 +FORCE_INLINE __m128i _mm_unpackhi_epi16(__m128i a, __m128i b) +{ +#if defined(__aarch64__) || defined(_M_ARM64) + return vreinterpretq_m128i_s16( + vzip2q_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b))); +#else + int16x4_t a1 = vget_high_s16(vreinterpretq_s16_m128i(a)); + int16x4_t b1 = vget_high_s16(vreinterpretq_s16_m128i(b)); + int16x4x2_t result = vzip_s16(a1, b1); + return vreinterpretq_m128i_s16(vcombine_s16(result.val[0], result.val[1])); +#endif +} + +// Unpack and interleave 32-bit integers from the high half of a and b, and +// store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpackhi_epi32 +FORCE_INLINE __m128i _mm_unpackhi_epi32(__m128i a, __m128i b) +{ +#if defined(__aarch64__) || defined(_M_ARM64) + return vreinterpretq_m128i_s32( + vzip2q_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b))); +#else + int32x2_t a1 = vget_high_s32(vreinterpretq_s32_m128i(a)); + int32x2_t b1 = vget_high_s32(vreinterpretq_s32_m128i(b)); + int32x2x2_t result = vzip_s32(a1, b1); + return vreinterpretq_m128i_s32(vcombine_s32(result.val[0], result.val[1])); +#endif +} + +// Unpack and interleave 64-bit integers from the high half of a and b, and +// store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpackhi_epi64 +FORCE_INLINE __m128i _mm_unpackhi_epi64(__m128i a, __m128i b) +{ +#if defined(__aarch64__) || defined(_M_ARM64) + return vreinterpretq_m128i_s64( + vzip2q_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(b))); +#else + int64x1_t a_h = vget_high_s64(vreinterpretq_s64_m128i(a)); + int64x1_t b_h = vget_high_s64(vreinterpretq_s64_m128i(b)); + return vreinterpretq_m128i_s64(vcombine_s64(a_h, b_h)); +#endif +} + +// Unpack and interleave 8-bit integers from the high half of a and b, and store +// the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpackhi_epi8 +FORCE_INLINE __m128i _mm_unpackhi_epi8(__m128i a, __m128i b) +{ +#if defined(__aarch64__) || defined(_M_ARM64) + return vreinterpretq_m128i_s8( + vzip2q_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b))); +#else + int8x8_t a1 = + vreinterpret_s8_s16(vget_high_s16(vreinterpretq_s16_m128i(a))); + int8x8_t b1 = + vreinterpret_s8_s16(vget_high_s16(vreinterpretq_s16_m128i(b))); + int8x8x2_t result = vzip_s8(a1, b1); + return vreinterpretq_m128i_s8(vcombine_s8(result.val[0], result.val[1])); +#endif +} + +// Unpack and interleave double-precision (64-bit) floating-point elements from +// the high half of a and b, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpackhi_pd +FORCE_INLINE __m128d _mm_unpackhi_pd(__m128d a, __m128d b) +{ +#if defined(__aarch64__) || defined(_M_ARM64) + return vreinterpretq_m128d_f64( + vzip2q_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b))); +#else + return vreinterpretq_m128d_s64( + vcombine_s64(vget_high_s64(vreinterpretq_s64_m128d(a)), + vget_high_s64(vreinterpretq_s64_m128d(b)))); +#endif +} + +// Unpack and interleave 16-bit integers from the low half of a and b, and store +// the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpacklo_epi16 +FORCE_INLINE __m128i _mm_unpacklo_epi16(__m128i a, __m128i b) +{ +#if defined(__aarch64__) || defined(_M_ARM64) + return vreinterpretq_m128i_s16( + vzip1q_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b))); +#else + int16x4_t a1 = vget_low_s16(vreinterpretq_s16_m128i(a)); + int16x4_t b1 = vget_low_s16(vreinterpretq_s16_m128i(b)); + int16x4x2_t result = vzip_s16(a1, b1); + return vreinterpretq_m128i_s16(vcombine_s16(result.val[0], result.val[1])); +#endif +} + +// Unpack and interleave 32-bit integers from the low half of a and b, and store +// the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpacklo_epi32 +FORCE_INLINE __m128i _mm_unpacklo_epi32(__m128i a, __m128i b) +{ +#if defined(__aarch64__) || defined(_M_ARM64) + return vreinterpretq_m128i_s32( + vzip1q_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b))); +#else + int32x2_t a1 = vget_low_s32(vreinterpretq_s32_m128i(a)); + int32x2_t b1 = vget_low_s32(vreinterpretq_s32_m128i(b)); + int32x2x2_t result = vzip_s32(a1, b1); + return vreinterpretq_m128i_s32(vcombine_s32(result.val[0], result.val[1])); +#endif +} + +// Unpack and interleave 64-bit integers from the low half of a and b, and store +// the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpacklo_epi64 +FORCE_INLINE __m128i _mm_unpacklo_epi64(__m128i a, __m128i b) +{ +#if defined(__aarch64__) || defined(_M_ARM64) + return vreinterpretq_m128i_s64( + vzip1q_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(b))); +#else + int64x1_t a_l = vget_low_s64(vreinterpretq_s64_m128i(a)); + int64x1_t b_l = vget_low_s64(vreinterpretq_s64_m128i(b)); + return vreinterpretq_m128i_s64(vcombine_s64(a_l, b_l)); +#endif +} + +// Unpack and interleave 8-bit integers from the low half of a and b, and store +// the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpacklo_epi8 +FORCE_INLINE __m128i _mm_unpacklo_epi8(__m128i a, __m128i b) +{ +#if defined(__aarch64__) || defined(_M_ARM64) + return vreinterpretq_m128i_s8( + vzip1q_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b))); +#else + int8x8_t a1 = vreinterpret_s8_s16(vget_low_s16(vreinterpretq_s16_m128i(a))); + int8x8_t b1 = vreinterpret_s8_s16(vget_low_s16(vreinterpretq_s16_m128i(b))); + int8x8x2_t result = vzip_s8(a1, b1); + return vreinterpretq_m128i_s8(vcombine_s8(result.val[0], result.val[1])); +#endif +} + +// Unpack and interleave double-precision (64-bit) floating-point elements from +// the low half of a and b, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpacklo_pd +FORCE_INLINE __m128d _mm_unpacklo_pd(__m128d a, __m128d b) +{ +#if defined(__aarch64__) || defined(_M_ARM64) + return vreinterpretq_m128d_f64( + vzip1q_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b))); +#else + return vreinterpretq_m128d_s64( + vcombine_s64(vget_low_s64(vreinterpretq_s64_m128d(a)), + vget_low_s64(vreinterpretq_s64_m128d(b)))); +#endif +} + +// Compute the bitwise XOR of packed double-precision (64-bit) floating-point +// elements in a and b, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_xor_pd +FORCE_INLINE __m128d _mm_xor_pd(__m128d a, __m128d b) +{ + return vreinterpretq_m128d_s64( + veorq_s64(vreinterpretq_s64_m128d(a), vreinterpretq_s64_m128d(b))); +} + +// Compute the bitwise XOR of 128 bits (representing integer data) in a and b, +// and store the result in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_xor_si128 +FORCE_INLINE __m128i _mm_xor_si128(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_s32( + veorq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b))); +} + +/* SSE3 */ + +// Alternatively add and subtract packed double-precision (64-bit) +// floating-point elements in a to/from packed elements in b, and store the +// results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_addsub_pd +FORCE_INLINE __m128d _mm_addsub_pd(__m128d a, __m128d b) +{ + _sse2neon_const __m128d mask = _mm_set_pd(1.0f, -1.0f); +#if defined(__aarch64__) || defined(_M_ARM64) + return vreinterpretq_m128d_f64(vfmaq_f64(vreinterpretq_f64_m128d(a), + vreinterpretq_f64_m128d(b), + vreinterpretq_f64_m128d(mask))); +#else + return _mm_add_pd(_mm_mul_pd(b, mask), a); +#endif +} + +// Alternatively add and subtract packed single-precision (32-bit) +// floating-point elements in a to/from packed elements in b, and store the +// results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=addsub_ps +FORCE_INLINE __m128 _mm_addsub_ps(__m128 a, __m128 b) +{ + _sse2neon_const __m128 mask = _mm_setr_ps(-1.0f, 1.0f, -1.0f, 1.0f); +#if (defined(__aarch64__) || defined(_M_ARM64)) || \ + defined(__ARM_FEATURE_FMA) /* VFPv4+ */ + return vreinterpretq_m128_f32(vfmaq_f32(vreinterpretq_f32_m128(a), + vreinterpretq_f32_m128(mask), + vreinterpretq_f32_m128(b))); +#else + return _mm_add_ps(_mm_mul_ps(b, mask), a); +#endif +} + +// Horizontally add adjacent pairs of double-precision (64-bit) floating-point +// elements in a and b, and pack the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hadd_pd +FORCE_INLINE __m128d _mm_hadd_pd(__m128d a, __m128d b) +{ +#if defined(__aarch64__) || defined(_M_ARM64) + return vreinterpretq_m128d_f64( + vpaddq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b))); +#else + double *da = (double *) &a; + double *db = (double *) &b; + double c[] = {da[0] + da[1], db[0] + db[1]}; + return vreinterpretq_m128d_u64(vld1q_u64((uint64_t *) c)); +#endif +} + +// Horizontally add adjacent pairs of single-precision (32-bit) floating-point +// elements in a and b, and pack the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hadd_ps +FORCE_INLINE __m128 _mm_hadd_ps(__m128 a, __m128 b) +{ +#if defined(__aarch64__) || defined(_M_ARM64) + return vreinterpretq_m128_f32( + vpaddq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))); +#else + float32x2_t a10 = vget_low_f32(vreinterpretq_f32_m128(a)); + float32x2_t a32 = vget_high_f32(vreinterpretq_f32_m128(a)); + float32x2_t b10 = vget_low_f32(vreinterpretq_f32_m128(b)); + float32x2_t b32 = vget_high_f32(vreinterpretq_f32_m128(b)); + return vreinterpretq_m128_f32( + vcombine_f32(vpadd_f32(a10, a32), vpadd_f32(b10, b32))); +#endif +} + +// Horizontally subtract adjacent pairs of double-precision (64-bit) +// floating-point elements in a and b, and pack the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hsub_pd +FORCE_INLINE __m128d _mm_hsub_pd(__m128d _a, __m128d _b) +{ +#if defined(__aarch64__) || defined(_M_ARM64) + float64x2_t a = vreinterpretq_f64_m128d(_a); + float64x2_t b = vreinterpretq_f64_m128d(_b); + return vreinterpretq_m128d_f64( + vsubq_f64(vuzp1q_f64(a, b), vuzp2q_f64(a, b))); +#else + double *da = (double *) &_a; + double *db = (double *) &_b; + double c[] = {da[0] - da[1], db[0] - db[1]}; + return vreinterpretq_m128d_u64(vld1q_u64((uint64_t *) c)); +#endif +} + +// Horizontally subtract adjacent pairs of single-precision (32-bit) +// floating-point elements in a and b, and pack the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hsub_ps +FORCE_INLINE __m128 _mm_hsub_ps(__m128 _a, __m128 _b) +{ + float32x4_t a = vreinterpretq_f32_m128(_a); + float32x4_t b = vreinterpretq_f32_m128(_b); +#if defined(__aarch64__) || defined(_M_ARM64) + return vreinterpretq_m128_f32( + vsubq_f32(vuzp1q_f32(a, b), vuzp2q_f32(a, b))); +#else + float32x4x2_t c = vuzpq_f32(a, b); + return vreinterpretq_m128_f32(vsubq_f32(c.val[0], c.val[1])); +#endif +} + +// Load 128-bits of integer data from unaligned memory into dst. This intrinsic +// may perform better than _mm_loadu_si128 when the data crosses a cache line +// boundary. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_lddqu_si128 +#define _mm_lddqu_si128 _mm_loadu_si128 + +// Load a double-precision (64-bit) floating-point element from memory into both +// elements of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loaddup_pd +#define _mm_loaddup_pd _mm_load1_pd + +// Duplicate the low double-precision (64-bit) floating-point element from a, +// and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movedup_pd +FORCE_INLINE __m128d _mm_movedup_pd(__m128d a) +{ +#if defined(__aarch64__) || defined(_M_ARM64) + return vreinterpretq_m128d_f64( + vdupq_laneq_f64(vreinterpretq_f64_m128d(a), 0)); +#else + return vreinterpretq_m128d_u64( + vdupq_n_u64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0))); +#endif +} + +// Duplicate odd-indexed single-precision (32-bit) floating-point elements +// from a, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movehdup_ps +FORCE_INLINE __m128 _mm_movehdup_ps(__m128 a) +{ +#if defined(__aarch64__) || defined(_M_ARM64) + return vreinterpretq_m128_f32( + vtrn2q_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a))); +#elif defined(_sse2neon_shuffle) + return vreinterpretq_m128_f32(vshuffleq_s32( + vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a), 1, 1, 3, 3)); +#else + float32_t a1 = vgetq_lane_f32(vreinterpretq_f32_m128(a), 1); + float32_t a3 = vgetq_lane_f32(vreinterpretq_f32_m128(a), 3); + float ALIGN_STRUCT(16) data[4] = {a1, a1, a3, a3}; + return vreinterpretq_m128_f32(vld1q_f32(data)); +#endif +} + +// Duplicate even-indexed single-precision (32-bit) floating-point elements +// from a, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_moveldup_ps +FORCE_INLINE __m128 _mm_moveldup_ps(__m128 a) +{ +#if defined(__aarch64__) || defined(_M_ARM64) + return vreinterpretq_m128_f32( + vtrn1q_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a))); +#elif defined(_sse2neon_shuffle) + return vreinterpretq_m128_f32(vshuffleq_s32( + vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a), 0, 0, 2, 2)); +#else + float32_t a0 = vgetq_lane_f32(vreinterpretq_f32_m128(a), 0); + float32_t a2 = vgetq_lane_f32(vreinterpretq_f32_m128(a), 2); + float ALIGN_STRUCT(16) data[4] = {a0, a0, a2, a2}; + return vreinterpretq_m128_f32(vld1q_f32(data)); +#endif +} + +/* SSSE3 */ + +// Compute the absolute value of packed signed 16-bit integers in a, and store +// the unsigned results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_abs_epi16 +FORCE_INLINE __m128i _mm_abs_epi16(__m128i a) +{ + return vreinterpretq_m128i_s16(vabsq_s16(vreinterpretq_s16_m128i(a))); +} + +// Compute the absolute value of packed signed 32-bit integers in a, and store +// the unsigned results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_abs_epi32 +FORCE_INLINE __m128i _mm_abs_epi32(__m128i a) +{ + return vreinterpretq_m128i_s32(vabsq_s32(vreinterpretq_s32_m128i(a))); +} + +// Compute the absolute value of packed signed 8-bit integers in a, and store +// the unsigned results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_abs_epi8 +FORCE_INLINE __m128i _mm_abs_epi8(__m128i a) +{ + return vreinterpretq_m128i_s8(vabsq_s8(vreinterpretq_s8_m128i(a))); +} + +// Compute the absolute value of packed signed 16-bit integers in a, and store +// the unsigned results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_abs_pi16 +FORCE_INLINE __m64 _mm_abs_pi16(__m64 a) +{ + return vreinterpret_m64_s16(vabs_s16(vreinterpret_s16_m64(a))); +} + +// Compute the absolute value of packed signed 32-bit integers in a, and store +// the unsigned results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_abs_pi32 +FORCE_INLINE __m64 _mm_abs_pi32(__m64 a) +{ + return vreinterpret_m64_s32(vabs_s32(vreinterpret_s32_m64(a))); +} + +// Compute the absolute value of packed signed 8-bit integers in a, and store +// the unsigned results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_abs_pi8 +FORCE_INLINE __m64 _mm_abs_pi8(__m64 a) +{ + return vreinterpret_m64_s8(vabs_s8(vreinterpret_s8_m64(a))); +} + +// Concatenate 16-byte blocks in a and b into a 32-byte temporary result, shift +// the result right by imm8 bytes, and store the low 16 bytes in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_alignr_epi8 +#if defined(__GNUC__) && !defined(__clang__) +#define _mm_alignr_epi8(a, b, imm) \ + __extension__({ \ + uint8x16_t _a = vreinterpretq_u8_m128i(a); \ + uint8x16_t _b = vreinterpretq_u8_m128i(b); \ + __m128i ret; \ + if (_sse2neon_unlikely((imm) & ~31)) \ + ret = vreinterpretq_m128i_u8(vdupq_n_u8(0)); \ + else if (imm >= 16) \ + ret = _mm_srli_si128(a, imm >= 16 ? imm - 16 : 0); \ + else \ + ret = \ + vreinterpretq_m128i_u8(vextq_u8(_b, _a, imm < 16 ? imm : 0)); \ + ret; \ + }) + +#else +#define _mm_alignr_epi8(a, b, imm) \ + _sse2neon_define2( \ + __m128i, a, b, uint8x16_t __a = vreinterpretq_u8_m128i(_a); \ + uint8x16_t __b = vreinterpretq_u8_m128i(_b); __m128i ret; \ + if (_sse2neon_unlikely((imm) & ~31)) ret = \ + vreinterpretq_m128i_u8(vdupq_n_u8(0)); \ + else if (imm >= 16) ret = \ + _mm_srli_si128(_a, imm >= 16 ? imm - 16 : 0); \ + else ret = \ + vreinterpretq_m128i_u8(vextq_u8(__b, __a, imm < 16 ? imm : 0)); \ + _sse2neon_return(ret);) + +#endif + +// Concatenate 8-byte blocks in a and b into a 16-byte temporary result, shift +// the result right by imm8 bytes, and store the low 8 bytes in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_alignr_pi8 +#define _mm_alignr_pi8(a, b, imm) \ + _sse2neon_define2( \ + __m64, a, b, __m64 ret; if (_sse2neon_unlikely((imm) >= 16)) { \ + ret = vreinterpret_m64_s8(vdup_n_s8(0)); \ + } else { \ + uint8x8_t tmp_low; \ + uint8x8_t tmp_high; \ + if ((imm) >= 8) { \ + const int idx = (imm) -8; \ + tmp_low = vreinterpret_u8_m64(_a); \ + tmp_high = vdup_n_u8(0); \ + ret = vreinterpret_m64_u8(vext_u8(tmp_low, tmp_high, idx)); \ + } else { \ + const int idx = (imm); \ + tmp_low = vreinterpret_u8_m64(_b); \ + tmp_high = vreinterpret_u8_m64(_a); \ + ret = vreinterpret_m64_u8(vext_u8(tmp_low, tmp_high, idx)); \ + } \ + } _sse2neon_return(ret);) + +// Horizontally add adjacent pairs of 16-bit integers in a and b, and pack the +// signed 16-bit results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hadd_epi16 +FORCE_INLINE __m128i _mm_hadd_epi16(__m128i _a, __m128i _b) +{ + int16x8_t a = vreinterpretq_s16_m128i(_a); + int16x8_t b = vreinterpretq_s16_m128i(_b); +#if defined(__aarch64__) || defined(_M_ARM64) + return vreinterpretq_m128i_s16(vpaddq_s16(a, b)); +#else + return vreinterpretq_m128i_s16( + vcombine_s16(vpadd_s16(vget_low_s16(a), vget_high_s16(a)), + vpadd_s16(vget_low_s16(b), vget_high_s16(b)))); +#endif +} + +// Horizontally add adjacent pairs of 32-bit integers in a and b, and pack the +// signed 32-bit results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hadd_epi32 +FORCE_INLINE __m128i _mm_hadd_epi32(__m128i _a, __m128i _b) +{ + int32x4_t a = vreinterpretq_s32_m128i(_a); + int32x4_t b = vreinterpretq_s32_m128i(_b); +#if defined(__aarch64__) || defined(_M_ARM64) + return vreinterpretq_m128i_s32(vpaddq_s32(a, b)); +#else + return vreinterpretq_m128i_s32( + vcombine_s32(vpadd_s32(vget_low_s32(a), vget_high_s32(a)), + vpadd_s32(vget_low_s32(b), vget_high_s32(b)))); +#endif +} + +// Horizontally add adjacent pairs of 16-bit integers in a and b, and pack the +// signed 16-bit results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hadd_pi16 +FORCE_INLINE __m64 _mm_hadd_pi16(__m64 a, __m64 b) +{ + return vreinterpret_m64_s16( + vpadd_s16(vreinterpret_s16_m64(a), vreinterpret_s16_m64(b))); +} + +// Horizontally add adjacent pairs of 32-bit integers in a and b, and pack the +// signed 32-bit results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hadd_pi32 +FORCE_INLINE __m64 _mm_hadd_pi32(__m64 a, __m64 b) +{ + return vreinterpret_m64_s32( + vpadd_s32(vreinterpret_s32_m64(a), vreinterpret_s32_m64(b))); +} + +// Horizontally add adjacent pairs of signed 16-bit integers in a and b using +// saturation, and pack the signed 16-bit results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hadds_epi16 +FORCE_INLINE __m128i _mm_hadds_epi16(__m128i _a, __m128i _b) +{ +#if defined(__aarch64__) || defined(_M_ARM64) + int16x8_t a = vreinterpretq_s16_m128i(_a); + int16x8_t b = vreinterpretq_s16_m128i(_b); + return vreinterpretq_s64_s16( + vqaddq_s16(vuzp1q_s16(a, b), vuzp2q_s16(a, b))); +#else + int32x4_t a = vreinterpretq_s32_m128i(_a); + int32x4_t b = vreinterpretq_s32_m128i(_b); + // Interleave using vshrn/vmovn + // [a0|a2|a4|a6|b0|b2|b4|b6] + // [a1|a3|a5|a7|b1|b3|b5|b7] + int16x8_t ab0246 = vcombine_s16(vmovn_s32(a), vmovn_s32(b)); + int16x8_t ab1357 = vcombine_s16(vshrn_n_s32(a, 16), vshrn_n_s32(b, 16)); + // Saturated add + return vreinterpretq_m128i_s16(vqaddq_s16(ab0246, ab1357)); +#endif +} + +// Horizontally add adjacent pairs of signed 16-bit integers in a and b using +// saturation, and pack the signed 16-bit results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hadds_pi16 +FORCE_INLINE __m64 _mm_hadds_pi16(__m64 _a, __m64 _b) +{ + int16x4_t a = vreinterpret_s16_m64(_a); + int16x4_t b = vreinterpret_s16_m64(_b); +#if defined(__aarch64__) || defined(_M_ARM64) + return vreinterpret_s64_s16(vqadd_s16(vuzp1_s16(a, b), vuzp2_s16(a, b))); +#else + int16x4x2_t res = vuzp_s16(a, b); + return vreinterpret_s64_s16(vqadd_s16(res.val[0], res.val[1])); +#endif +} + +// Horizontally subtract adjacent pairs of 16-bit integers in a and b, and pack +// the signed 16-bit results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hsub_epi16 +FORCE_INLINE __m128i _mm_hsub_epi16(__m128i _a, __m128i _b) +{ + int16x8_t a = vreinterpretq_s16_m128i(_a); + int16x8_t b = vreinterpretq_s16_m128i(_b); +#if defined(__aarch64__) || defined(_M_ARM64) + return vreinterpretq_m128i_s16( + vsubq_s16(vuzp1q_s16(a, b), vuzp2q_s16(a, b))); +#else + int16x8x2_t c = vuzpq_s16(a, b); + return vreinterpretq_m128i_s16(vsubq_s16(c.val[0], c.val[1])); +#endif +} + +// Horizontally subtract adjacent pairs of 32-bit integers in a and b, and pack +// the signed 32-bit results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hsub_epi32 +FORCE_INLINE __m128i _mm_hsub_epi32(__m128i _a, __m128i _b) +{ + int32x4_t a = vreinterpretq_s32_m128i(_a); + int32x4_t b = vreinterpretq_s32_m128i(_b); +#if defined(__aarch64__) || defined(_M_ARM64) + return vreinterpretq_m128i_s32( + vsubq_s32(vuzp1q_s32(a, b), vuzp2q_s32(a, b))); +#else + int32x4x2_t c = vuzpq_s32(a, b); + return vreinterpretq_m128i_s32(vsubq_s32(c.val[0], c.val[1])); +#endif +} + +// Horizontally subtract adjacent pairs of 16-bit integers in a and b, and pack +// the signed 16-bit results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hsub_pi16 +FORCE_INLINE __m64 _mm_hsub_pi16(__m64 _a, __m64 _b) +{ + int16x4_t a = vreinterpret_s16_m64(_a); + int16x4_t b = vreinterpret_s16_m64(_b); +#if defined(__aarch64__) || defined(_M_ARM64) + return vreinterpret_m64_s16(vsub_s16(vuzp1_s16(a, b), vuzp2_s16(a, b))); +#else + int16x4x2_t c = vuzp_s16(a, b); + return vreinterpret_m64_s16(vsub_s16(c.val[0], c.val[1])); +#endif +} + +// Horizontally subtract adjacent pairs of 32-bit integers in a and b, and pack +// the signed 32-bit results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_hsub_pi32 +FORCE_INLINE __m64 _mm_hsub_pi32(__m64 _a, __m64 _b) +{ + int32x2_t a = vreinterpret_s32_m64(_a); + int32x2_t b = vreinterpret_s32_m64(_b); +#if defined(__aarch64__) || defined(_M_ARM64) + return vreinterpret_m64_s32(vsub_s32(vuzp1_s32(a, b), vuzp2_s32(a, b))); +#else + int32x2x2_t c = vuzp_s32(a, b); + return vreinterpret_m64_s32(vsub_s32(c.val[0], c.val[1])); +#endif +} + +// Horizontally subtract adjacent pairs of signed 16-bit integers in a and b +// using saturation, and pack the signed 16-bit results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hsubs_epi16 +FORCE_INLINE __m128i _mm_hsubs_epi16(__m128i _a, __m128i _b) +{ + int16x8_t a = vreinterpretq_s16_m128i(_a); + int16x8_t b = vreinterpretq_s16_m128i(_b); +#if defined(__aarch64__) || defined(_M_ARM64) + return vreinterpretq_m128i_s16( + vqsubq_s16(vuzp1q_s16(a, b), vuzp2q_s16(a, b))); +#else + int16x8x2_t c = vuzpq_s16(a, b); + return vreinterpretq_m128i_s16(vqsubq_s16(c.val[0], c.val[1])); +#endif +} + +// Horizontally subtract adjacent pairs of signed 16-bit integers in a and b +// using saturation, and pack the signed 16-bit results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hsubs_pi16 +FORCE_INLINE __m64 _mm_hsubs_pi16(__m64 _a, __m64 _b) +{ + int16x4_t a = vreinterpret_s16_m64(_a); + int16x4_t b = vreinterpret_s16_m64(_b); +#if defined(__aarch64__) || defined(_M_ARM64) + return vreinterpret_m64_s16(vqsub_s16(vuzp1_s16(a, b), vuzp2_s16(a, b))); +#else + int16x4x2_t c = vuzp_s16(a, b); + return vreinterpret_m64_s16(vqsub_s16(c.val[0], c.val[1])); +#endif +} + +// Vertically multiply each unsigned 8-bit integer from a with the corresponding +// signed 8-bit integer from b, producing intermediate signed 16-bit integers. +// Horizontally add adjacent pairs of intermediate signed 16-bit integers, +// and pack the saturated results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maddubs_epi16 +FORCE_INLINE __m128i _mm_maddubs_epi16(__m128i _a, __m128i _b) +{ +#if defined(__aarch64__) || defined(_M_ARM64) + uint8x16_t a = vreinterpretq_u8_m128i(_a); + int8x16_t b = vreinterpretq_s8_m128i(_b); + int16x8_t tl = vmulq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(a))), + vmovl_s8(vget_low_s8(b))); + int16x8_t th = vmulq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(a))), + vmovl_s8(vget_high_s8(b))); + return vreinterpretq_m128i_s16( + vqaddq_s16(vuzp1q_s16(tl, th), vuzp2q_s16(tl, th))); +#else + // This would be much simpler if x86 would choose to zero extend OR sign + // extend, not both. This could probably be optimized better. + uint16x8_t a = vreinterpretq_u16_m128i(_a); + int16x8_t b = vreinterpretq_s16_m128i(_b); + + // Zero extend a + int16x8_t a_odd = vreinterpretq_s16_u16(vshrq_n_u16(a, 8)); + int16x8_t a_even = vreinterpretq_s16_u16(vbicq_u16(a, vdupq_n_u16(0xff00))); + + // Sign extend by shifting left then shifting right. + int16x8_t b_even = vshrq_n_s16(vshlq_n_s16(b, 8), 8); + int16x8_t b_odd = vshrq_n_s16(b, 8); + + // multiply + int16x8_t prod1 = vmulq_s16(a_even, b_even); + int16x8_t prod2 = vmulq_s16(a_odd, b_odd); + + // saturated add + return vreinterpretq_m128i_s16(vqaddq_s16(prod1, prod2)); +#endif +} + +// Vertically multiply each unsigned 8-bit integer from a with the corresponding +// signed 8-bit integer from b, producing intermediate signed 16-bit integers. +// Horizontally add adjacent pairs of intermediate signed 16-bit integers, and +// pack the saturated results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maddubs_pi16 +FORCE_INLINE __m64 _mm_maddubs_pi16(__m64 _a, __m64 _b) +{ + uint16x4_t a = vreinterpret_u16_m64(_a); + int16x4_t b = vreinterpret_s16_m64(_b); + + // Zero extend a + int16x4_t a_odd = vreinterpret_s16_u16(vshr_n_u16(a, 8)); + int16x4_t a_even = vreinterpret_s16_u16(vand_u16(a, vdup_n_u16(0xff))); + + // Sign extend by shifting left then shifting right. + int16x4_t b_even = vshr_n_s16(vshl_n_s16(b, 8), 8); + int16x4_t b_odd = vshr_n_s16(b, 8); + + // multiply + int16x4_t prod1 = vmul_s16(a_even, b_even); + int16x4_t prod2 = vmul_s16(a_odd, b_odd); + + // saturated add + return vreinterpret_m64_s16(vqadd_s16(prod1, prod2)); +} + +// Multiply packed signed 16-bit integers in a and b, producing intermediate +// signed 32-bit integers. Shift right by 15 bits while rounding up, and store +// the packed 16-bit integers in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mulhrs_epi16 +FORCE_INLINE __m128i _mm_mulhrs_epi16(__m128i a, __m128i b) +{ + // Has issues due to saturation + // return vreinterpretq_m128i_s16(vqrdmulhq_s16(a, b)); + + // Multiply + int32x4_t mul_lo = vmull_s16(vget_low_s16(vreinterpretq_s16_m128i(a)), + vget_low_s16(vreinterpretq_s16_m128i(b))); + int32x4_t mul_hi = vmull_s16(vget_high_s16(vreinterpretq_s16_m128i(a)), + vget_high_s16(vreinterpretq_s16_m128i(b))); + + // Rounding narrowing shift right + // narrow = (int16_t)((mul + 16384) >> 15); + int16x4_t narrow_lo = vrshrn_n_s32(mul_lo, 15); + int16x4_t narrow_hi = vrshrn_n_s32(mul_hi, 15); + + // Join together + return vreinterpretq_m128i_s16(vcombine_s16(narrow_lo, narrow_hi)); +} + +// Multiply packed signed 16-bit integers in a and b, producing intermediate +// signed 32-bit integers. Truncate each intermediate integer to the 18 most +// significant bits, round by adding 1, and store bits [16:1] to dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mulhrs_pi16 +FORCE_INLINE __m64 _mm_mulhrs_pi16(__m64 a, __m64 b) +{ + int32x4_t mul_extend = + vmull_s16((vreinterpret_s16_m64(a)), (vreinterpret_s16_m64(b))); + + // Rounding narrowing shift right + return vreinterpret_m64_s16(vrshrn_n_s32(mul_extend, 15)); +} + +// Shuffle packed 8-bit integers in a according to shuffle control mask in the +// corresponding 8-bit element of b, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shuffle_epi8 +FORCE_INLINE __m128i _mm_shuffle_epi8(__m128i a, __m128i b) +{ + int8x16_t tbl = vreinterpretq_s8_m128i(a); // input a + uint8x16_t idx = vreinterpretq_u8_m128i(b); // input b + uint8x16_t idx_masked = + vandq_u8(idx, vdupq_n_u8(0x8F)); // avoid using meaningless bits +#if defined(__aarch64__) || defined(_M_ARM64) + return vreinterpretq_m128i_s8(vqtbl1q_s8(tbl, idx_masked)); +#elif defined(__GNUC__) + int8x16_t ret; + // %e and %f represent the even and odd D registers + // respectively. + __asm__ __volatile__( + "vtbl.8 %e[ret], {%e[tbl], %f[tbl]}, %e[idx]\n" + "vtbl.8 %f[ret], {%e[tbl], %f[tbl]}, %f[idx]\n" + : [ret] "=&w"(ret) + : [tbl] "w"(tbl), [idx] "w"(idx_masked)); + return vreinterpretq_m128i_s8(ret); +#else + // use this line if testing on aarch64 + int8x8x2_t a_split = {vget_low_s8(tbl), vget_high_s8(tbl)}; + return vreinterpretq_m128i_s8( + vcombine_s8(vtbl2_s8(a_split, vget_low_u8(idx_masked)), + vtbl2_s8(a_split, vget_high_u8(idx_masked)))); +#endif +} + +// Shuffle packed 8-bit integers in a according to shuffle control mask in the +// corresponding 8-bit element of b, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shuffle_pi8 +FORCE_INLINE __m64 _mm_shuffle_pi8(__m64 a, __m64 b) +{ + const int8x8_t controlMask = + vand_s8(vreinterpret_s8_m64(b), vdup_n_s8((int8_t) (0x1 << 7 | 0x07))); + int8x8_t res = vtbl1_s8(vreinterpret_s8_m64(a), controlMask); + return vreinterpret_m64_s8(res); +} + +// Negate packed 16-bit integers in a when the corresponding signed +// 16-bit integer in b is negative, and store the results in dst. +// Element in dst are zeroed out when the corresponding element +// in b is zero. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sign_epi16 +FORCE_INLINE __m128i _mm_sign_epi16(__m128i _a, __m128i _b) +{ + int16x8_t a = vreinterpretq_s16_m128i(_a); + int16x8_t b = vreinterpretq_s16_m128i(_b); + + // signed shift right: faster than vclt + // (b < 0) ? 0xFFFF : 0 + uint16x8_t ltMask = vreinterpretq_u16_s16(vshrq_n_s16(b, 15)); + // (b == 0) ? 0xFFFF : 0 +#if defined(__aarch64__) || defined(_M_ARM64) + int16x8_t zeroMask = vreinterpretq_s16_u16(vceqzq_s16(b)); +#else + int16x8_t zeroMask = vreinterpretq_s16_u16(vceqq_s16(b, vdupq_n_s16(0))); +#endif + + // bitwise select either a or negative 'a' (vnegq_s16(a) equals to negative + // 'a') based on ltMask + int16x8_t masked = vbslq_s16(ltMask, vnegq_s16(a), a); + // res = masked & (~zeroMask) + int16x8_t res = vbicq_s16(masked, zeroMask); + return vreinterpretq_m128i_s16(res); +} + +// Negate packed 32-bit integers in a when the corresponding signed +// 32-bit integer in b is negative, and store the results in dst. +// Element in dst are zeroed out when the corresponding element +// in b is zero. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sign_epi32 +FORCE_INLINE __m128i _mm_sign_epi32(__m128i _a, __m128i _b) +{ + int32x4_t a = vreinterpretq_s32_m128i(_a); + int32x4_t b = vreinterpretq_s32_m128i(_b); + + // signed shift right: faster than vclt + // (b < 0) ? 0xFFFFFFFF : 0 + uint32x4_t ltMask = vreinterpretq_u32_s32(vshrq_n_s32(b, 31)); + + // (b == 0) ? 0xFFFFFFFF : 0 +#if defined(__aarch64__) || defined(_M_ARM64) + int32x4_t zeroMask = vreinterpretq_s32_u32(vceqzq_s32(b)); +#else + int32x4_t zeroMask = vreinterpretq_s32_u32(vceqq_s32(b, vdupq_n_s32(0))); +#endif + + // bitwise select either a or negative 'a' (vnegq_s32(a) equals to negative + // 'a') based on ltMask + int32x4_t masked = vbslq_s32(ltMask, vnegq_s32(a), a); + // res = masked & (~zeroMask) + int32x4_t res = vbicq_s32(masked, zeroMask); + return vreinterpretq_m128i_s32(res); +} + +// Negate packed 8-bit integers in a when the corresponding signed +// 8-bit integer in b is negative, and store the results in dst. +// Element in dst are zeroed out when the corresponding element +// in b is zero. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sign_epi8 +FORCE_INLINE __m128i _mm_sign_epi8(__m128i _a, __m128i _b) +{ + int8x16_t a = vreinterpretq_s8_m128i(_a); + int8x16_t b = vreinterpretq_s8_m128i(_b); + + // signed shift right: faster than vclt + // (b < 0) ? 0xFF : 0 + uint8x16_t ltMask = vreinterpretq_u8_s8(vshrq_n_s8(b, 7)); + + // (b == 0) ? 0xFF : 0 +#if defined(__aarch64__) || defined(_M_ARM64) + int8x16_t zeroMask = vreinterpretq_s8_u8(vceqzq_s8(b)); +#else + int8x16_t zeroMask = vreinterpretq_s8_u8(vceqq_s8(b, vdupq_n_s8(0))); +#endif + + // bitwise select either a or negative 'a' (vnegq_s8(a) return negative 'a') + // based on ltMask + int8x16_t masked = vbslq_s8(ltMask, vnegq_s8(a), a); + // res = masked & (~zeroMask) + int8x16_t res = vbicq_s8(masked, zeroMask); + + return vreinterpretq_m128i_s8(res); +} + +// Negate packed 16-bit integers in a when the corresponding signed 16-bit +// integer in b is negative, and store the results in dst. Element in dst are +// zeroed out when the corresponding element in b is zero. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sign_pi16 +FORCE_INLINE __m64 _mm_sign_pi16(__m64 _a, __m64 _b) +{ + int16x4_t a = vreinterpret_s16_m64(_a); + int16x4_t b = vreinterpret_s16_m64(_b); + + // signed shift right: faster than vclt + // (b < 0) ? 0xFFFF : 0 + uint16x4_t ltMask = vreinterpret_u16_s16(vshr_n_s16(b, 15)); + + // (b == 0) ? 0xFFFF : 0 +#if defined(__aarch64__) || defined(_M_ARM64) + int16x4_t zeroMask = vreinterpret_s16_u16(vceqz_s16(b)); +#else + int16x4_t zeroMask = vreinterpret_s16_u16(vceq_s16(b, vdup_n_s16(0))); +#endif + + // bitwise select either a or negative 'a' (vneg_s16(a) return negative 'a') + // based on ltMask + int16x4_t masked = vbsl_s16(ltMask, vneg_s16(a), a); + // res = masked & (~zeroMask) + int16x4_t res = vbic_s16(masked, zeroMask); + + return vreinterpret_m64_s16(res); +} + +// Negate packed 32-bit integers in a when the corresponding signed 32-bit +// integer in b is negative, and store the results in dst. Element in dst are +// zeroed out when the corresponding element in b is zero. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sign_pi32 +FORCE_INLINE __m64 _mm_sign_pi32(__m64 _a, __m64 _b) +{ + int32x2_t a = vreinterpret_s32_m64(_a); + int32x2_t b = vreinterpret_s32_m64(_b); + + // signed shift right: faster than vclt + // (b < 0) ? 0xFFFFFFFF : 0 + uint32x2_t ltMask = vreinterpret_u32_s32(vshr_n_s32(b, 31)); + + // (b == 0) ? 0xFFFFFFFF : 0 +#if defined(__aarch64__) || defined(_M_ARM64) + int32x2_t zeroMask = vreinterpret_s32_u32(vceqz_s32(b)); +#else + int32x2_t zeroMask = vreinterpret_s32_u32(vceq_s32(b, vdup_n_s32(0))); +#endif + + // bitwise select either a or negative 'a' (vneg_s32(a) return negative 'a') + // based on ltMask + int32x2_t masked = vbsl_s32(ltMask, vneg_s32(a), a); + // res = masked & (~zeroMask) + int32x2_t res = vbic_s32(masked, zeroMask); + + return vreinterpret_m64_s32(res); +} + +// Negate packed 8-bit integers in a when the corresponding signed 8-bit integer +// in b is negative, and store the results in dst. Element in dst are zeroed out +// when the corresponding element in b is zero. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sign_pi8 +FORCE_INLINE __m64 _mm_sign_pi8(__m64 _a, __m64 _b) +{ + int8x8_t a = vreinterpret_s8_m64(_a); + int8x8_t b = vreinterpret_s8_m64(_b); + + // signed shift right: faster than vclt + // (b < 0) ? 0xFF : 0 + uint8x8_t ltMask = vreinterpret_u8_s8(vshr_n_s8(b, 7)); + + // (b == 0) ? 0xFF : 0 +#if defined(__aarch64__) || defined(_M_ARM64) + int8x8_t zeroMask = vreinterpret_s8_u8(vceqz_s8(b)); +#else + int8x8_t zeroMask = vreinterpret_s8_u8(vceq_s8(b, vdup_n_s8(0))); +#endif + + // bitwise select either a or negative 'a' (vneg_s8(a) return negative 'a') + // based on ltMask + int8x8_t masked = vbsl_s8(ltMask, vneg_s8(a), a); + // res = masked & (~zeroMask) + int8x8_t res = vbic_s8(masked, zeroMask); + + return vreinterpret_m64_s8(res); +} + +/* SSE4.1 */ + +// Blend packed 16-bit integers from a and b using control mask imm8, and store +// the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_blend_epi16 +// FORCE_INLINE __m128i _mm_blend_epi16(__m128i a, __m128i b, +// __constrange(0,255) int imm) +#define _mm_blend_epi16(a, b, imm) \ + _sse2neon_define2( \ + __m128i, a, b, \ + const uint16_t _mask[8] = \ + _sse2neon_init(((imm) & (1 << 0)) ? (uint16_t) -1 : 0x0, \ + ((imm) & (1 << 1)) ? (uint16_t) -1 : 0x0, \ + ((imm) & (1 << 2)) ? (uint16_t) -1 : 0x0, \ + ((imm) & (1 << 3)) ? (uint16_t) -1 : 0x0, \ + ((imm) & (1 << 4)) ? (uint16_t) -1 : 0x0, \ + ((imm) & (1 << 5)) ? (uint16_t) -1 : 0x0, \ + ((imm) & (1 << 6)) ? (uint16_t) -1 : 0x0, \ + ((imm) & (1 << 7)) ? (uint16_t) -1 : 0x0); \ + uint16x8_t _mask_vec = vld1q_u16(_mask); \ + uint16x8_t __a = vreinterpretq_u16_m128i(_a); \ + uint16x8_t __b = vreinterpretq_u16_m128i(_b); _sse2neon_return( \ + vreinterpretq_m128i_u16(vbslq_u16(_mask_vec, __b, __a)));) + +// Blend packed double-precision (64-bit) floating-point elements from a and b +// using control mask imm8, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_blend_pd +#define _mm_blend_pd(a, b, imm) \ + _sse2neon_define2( \ + __m128d, a, b, \ + const uint64_t _mask[2] = \ + _sse2neon_init(((imm) & (1 << 0)) ? ~UINT64_C(0) : UINT64_C(0), \ + ((imm) & (1 << 1)) ? ~UINT64_C(0) : UINT64_C(0)); \ + uint64x2_t _mask_vec = vld1q_u64(_mask); \ + uint64x2_t __a = vreinterpretq_u64_m128d(_a); \ + uint64x2_t __b = vreinterpretq_u64_m128d(_b); _sse2neon_return( \ + vreinterpretq_m128d_u64(vbslq_u64(_mask_vec, __b, __a)));) + +// Blend packed single-precision (32-bit) floating-point elements from a and b +// using mask, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_blend_ps +FORCE_INLINE __m128 _mm_blend_ps(__m128 _a, __m128 _b, const char imm8) +{ + const uint32_t ALIGN_STRUCT(16) + data[4] = {((imm8) & (1 << 0)) ? UINT32_MAX : 0, + ((imm8) & (1 << 1)) ? UINT32_MAX : 0, + ((imm8) & (1 << 2)) ? UINT32_MAX : 0, + ((imm8) & (1 << 3)) ? UINT32_MAX : 0}; + uint32x4_t mask = vld1q_u32(data); + float32x4_t a = vreinterpretq_f32_m128(_a); + float32x4_t b = vreinterpretq_f32_m128(_b); + return vreinterpretq_m128_f32(vbslq_f32(mask, b, a)); +} + +// Blend packed 8-bit integers from a and b using mask, and store the results in +// dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_blendv_epi8 +FORCE_INLINE __m128i _mm_blendv_epi8(__m128i _a, __m128i _b, __m128i _mask) +{ + // Use a signed shift right to create a mask with the sign bit + uint8x16_t mask = + vreinterpretq_u8_s8(vshrq_n_s8(vreinterpretq_s8_m128i(_mask), 7)); + uint8x16_t a = vreinterpretq_u8_m128i(_a); + uint8x16_t b = vreinterpretq_u8_m128i(_b); + return vreinterpretq_m128i_u8(vbslq_u8(mask, b, a)); +} + +// Blend packed double-precision (64-bit) floating-point elements from a and b +// using mask, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_blendv_pd +FORCE_INLINE __m128d _mm_blendv_pd(__m128d _a, __m128d _b, __m128d _mask) +{ + uint64x2_t mask = + vreinterpretq_u64_s64(vshrq_n_s64(vreinterpretq_s64_m128d(_mask), 63)); +#if defined(__aarch64__) || defined(_M_ARM64) + float64x2_t a = vreinterpretq_f64_m128d(_a); + float64x2_t b = vreinterpretq_f64_m128d(_b); + return vreinterpretq_m128d_f64(vbslq_f64(mask, b, a)); +#else + uint64x2_t a = vreinterpretq_u64_m128d(_a); + uint64x2_t b = vreinterpretq_u64_m128d(_b); + return vreinterpretq_m128d_u64(vbslq_u64(mask, b, a)); +#endif +} + +// Blend packed single-precision (32-bit) floating-point elements from a and b +// using mask, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_blendv_ps +FORCE_INLINE __m128 _mm_blendv_ps(__m128 _a, __m128 _b, __m128 _mask) +{ + // Use a signed shift right to create a mask with the sign bit + uint32x4_t mask = + vreinterpretq_u32_s32(vshrq_n_s32(vreinterpretq_s32_m128(_mask), 31)); + float32x4_t a = vreinterpretq_f32_m128(_a); + float32x4_t b = vreinterpretq_f32_m128(_b); + return vreinterpretq_m128_f32(vbslq_f32(mask, b, a)); +} + +// Round the packed double-precision (64-bit) floating-point elements in a up +// to an integer value, and store the results as packed double-precision +// floating-point elements in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_ceil_pd +FORCE_INLINE __m128d _mm_ceil_pd(__m128d a) +{ +#if defined(__aarch64__) || defined(_M_ARM64) + return vreinterpretq_m128d_f64(vrndpq_f64(vreinterpretq_f64_m128d(a))); +#else + double *f = (double *) &a; + return _mm_set_pd(ceil(f[1]), ceil(f[0])); +#endif +} + +// Round the packed single-precision (32-bit) floating-point elements in a up to +// an integer value, and store the results as packed single-precision +// floating-point elements in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_ceil_ps +FORCE_INLINE __m128 _mm_ceil_ps(__m128 a) +{ +#if (defined(__aarch64__) || defined(_M_ARM64)) || \ + defined(__ARM_FEATURE_DIRECTED_ROUNDING) + return vreinterpretq_m128_f32(vrndpq_f32(vreinterpretq_f32_m128(a))); +#else + float *f = (float *) &a; + return _mm_set_ps(ceilf(f[3]), ceilf(f[2]), ceilf(f[1]), ceilf(f[0])); +#endif +} + +// Round the lower double-precision (64-bit) floating-point element in b up to +// an integer value, store the result as a double-precision floating-point +// element in the lower element of dst, and copy the upper element from a to the +// upper element of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_ceil_sd +FORCE_INLINE __m128d _mm_ceil_sd(__m128d a, __m128d b) +{ + return _mm_move_sd(a, _mm_ceil_pd(b)); +} + +// Round the lower single-precision (32-bit) floating-point element in b up to +// an integer value, store the result as a single-precision floating-point +// element in the lower element of dst, and copy the upper 3 packed elements +// from a to the upper elements of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_ceil_ss +FORCE_INLINE __m128 _mm_ceil_ss(__m128 a, __m128 b) +{ + return _mm_move_ss(a, _mm_ceil_ps(b)); +} + +// Compare packed 64-bit integers in a and b for equality, and store the results +// in dst +FORCE_INLINE __m128i _mm_cmpeq_epi64(__m128i a, __m128i b) +{ +#if defined(__aarch64__) || defined(_M_ARM64) + return vreinterpretq_m128i_u64( + vceqq_u64(vreinterpretq_u64_m128i(a), vreinterpretq_u64_m128i(b))); +#else + // ARMv7 lacks vceqq_u64 + // (a == b) -> (a_lo == b_lo) && (a_hi == b_hi) + uint32x4_t cmp = + vceqq_u32(vreinterpretq_u32_m128i(a), vreinterpretq_u32_m128i(b)); + uint32x4_t swapped = vrev64q_u32(cmp); + return vreinterpretq_m128i_u32(vandq_u32(cmp, swapped)); +#endif +} + +// Sign extend packed 16-bit integers in a to packed 32-bit integers, and store +// the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepi16_epi32 +FORCE_INLINE __m128i _mm_cvtepi16_epi32(__m128i a) +{ + return vreinterpretq_m128i_s32( + vmovl_s16(vget_low_s16(vreinterpretq_s16_m128i(a)))); +} + +// Sign extend packed 16-bit integers in a to packed 64-bit integers, and store +// the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepi16_epi64 +FORCE_INLINE __m128i _mm_cvtepi16_epi64(__m128i a) +{ + int16x8_t s16x8 = vreinterpretq_s16_m128i(a); /* xxxx xxxx xxxx 0B0A */ + int32x4_t s32x4 = vmovl_s16(vget_low_s16(s16x8)); /* 000x 000x 000B 000A */ + int64x2_t s64x2 = vmovl_s32(vget_low_s32(s32x4)); /* 0000 000B 0000 000A */ + return vreinterpretq_m128i_s64(s64x2); +} + +// Sign extend packed 32-bit integers in a to packed 64-bit integers, and store +// the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepi32_epi64 +FORCE_INLINE __m128i _mm_cvtepi32_epi64(__m128i a) +{ + return vreinterpretq_m128i_s64( + vmovl_s32(vget_low_s32(vreinterpretq_s32_m128i(a)))); +} + +// Sign extend packed 8-bit integers in a to packed 16-bit integers, and store +// the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepi8_epi16 +FORCE_INLINE __m128i _mm_cvtepi8_epi16(__m128i a) +{ + int8x16_t s8x16 = vreinterpretq_s8_m128i(a); /* xxxx xxxx xxxx DCBA */ + int16x8_t s16x8 = vmovl_s8(vget_low_s8(s8x16)); /* 0x0x 0x0x 0D0C 0B0A */ + return vreinterpretq_m128i_s16(s16x8); +} + +// Sign extend packed 8-bit integers in a to packed 32-bit integers, and store +// the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepi8_epi32 +FORCE_INLINE __m128i _mm_cvtepi8_epi32(__m128i a) +{ + int8x16_t s8x16 = vreinterpretq_s8_m128i(a); /* xxxx xxxx xxxx DCBA */ + int16x8_t s16x8 = vmovl_s8(vget_low_s8(s8x16)); /* 0x0x 0x0x 0D0C 0B0A */ + int32x4_t s32x4 = vmovl_s16(vget_low_s16(s16x8)); /* 000D 000C 000B 000A */ + return vreinterpretq_m128i_s32(s32x4); +} + +// Sign extend packed 8-bit integers in the low 8 bytes of a to packed 64-bit +// integers, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepi8_epi64 +FORCE_INLINE __m128i _mm_cvtepi8_epi64(__m128i a) +{ + int8x16_t s8x16 = vreinterpretq_s8_m128i(a); /* xxxx xxxx xxxx xxBA */ + int16x8_t s16x8 = vmovl_s8(vget_low_s8(s8x16)); /* 0x0x 0x0x 0x0x 0B0A */ + int32x4_t s32x4 = vmovl_s16(vget_low_s16(s16x8)); /* 000x 000x 000B 000A */ + int64x2_t s64x2 = vmovl_s32(vget_low_s32(s32x4)); /* 0000 000B 0000 000A */ + return vreinterpretq_m128i_s64(s64x2); +} + +// Zero extend packed unsigned 16-bit integers in a to packed 32-bit integers, +// and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepu16_epi32 +FORCE_INLINE __m128i _mm_cvtepu16_epi32(__m128i a) +{ + return vreinterpretq_m128i_u32( + vmovl_u16(vget_low_u16(vreinterpretq_u16_m128i(a)))); +} + +// Zero extend packed unsigned 16-bit integers in a to packed 64-bit integers, +// and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepu16_epi64 +FORCE_INLINE __m128i _mm_cvtepu16_epi64(__m128i a) +{ + uint16x8_t u16x8 = vreinterpretq_u16_m128i(a); /* xxxx xxxx xxxx 0B0A */ + uint32x4_t u32x4 = vmovl_u16(vget_low_u16(u16x8)); /* 000x 000x 000B 000A */ + uint64x2_t u64x2 = vmovl_u32(vget_low_u32(u32x4)); /* 0000 000B 0000 000A */ + return vreinterpretq_m128i_u64(u64x2); +} + +// Zero extend packed unsigned 32-bit integers in a to packed 64-bit integers, +// and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepu32_epi64 +FORCE_INLINE __m128i _mm_cvtepu32_epi64(__m128i a) +{ + return vreinterpretq_m128i_u64( + vmovl_u32(vget_low_u32(vreinterpretq_u32_m128i(a)))); +} + +// Zero extend packed unsigned 8-bit integers in a to packed 16-bit integers, +// and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepu8_epi16 +FORCE_INLINE __m128i _mm_cvtepu8_epi16(__m128i a) +{ + uint8x16_t u8x16 = vreinterpretq_u8_m128i(a); /* xxxx xxxx HGFE DCBA */ + uint16x8_t u16x8 = vmovl_u8(vget_low_u8(u8x16)); /* 0H0G 0F0E 0D0C 0B0A */ + return vreinterpretq_m128i_u16(u16x8); +} + +// Zero extend packed unsigned 8-bit integers in a to packed 32-bit integers, +// and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepu8_epi32 +FORCE_INLINE __m128i _mm_cvtepu8_epi32(__m128i a) +{ + uint8x16_t u8x16 = vreinterpretq_u8_m128i(a); /* xxxx xxxx xxxx DCBA */ + uint16x8_t u16x8 = vmovl_u8(vget_low_u8(u8x16)); /* 0x0x 0x0x 0D0C 0B0A */ + uint32x4_t u32x4 = vmovl_u16(vget_low_u16(u16x8)); /* 000D 000C 000B 000A */ + return vreinterpretq_m128i_u32(u32x4); +} + +// Zero extend packed unsigned 8-bit integers in the low 8 bytes of a to packed +// 64-bit integers, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepu8_epi64 +FORCE_INLINE __m128i _mm_cvtepu8_epi64(__m128i a) +{ + uint8x16_t u8x16 = vreinterpretq_u8_m128i(a); /* xxxx xxxx xxxx xxBA */ + uint16x8_t u16x8 = vmovl_u8(vget_low_u8(u8x16)); /* 0x0x 0x0x 0x0x 0B0A */ + uint32x4_t u32x4 = vmovl_u16(vget_low_u16(u16x8)); /* 000x 000x 000B 000A */ + uint64x2_t u64x2 = vmovl_u32(vget_low_u32(u32x4)); /* 0000 000B 0000 000A */ + return vreinterpretq_m128i_u64(u64x2); +} + +// Conditionally multiply the packed double-precision (64-bit) floating-point +// elements in a and b using the high 4 bits in imm8, sum the four products, and +// conditionally store the sum in dst using the low 4 bits of imm8. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_dp_pd +FORCE_INLINE __m128d _mm_dp_pd(__m128d a, __m128d b, const int imm) +{ + // Generate mask value from constant immediate bit value + const int64_t bit0Mask = imm & 0x01 ? UINT64_MAX : 0; + const int64_t bit1Mask = imm & 0x02 ? UINT64_MAX : 0; +#if !SSE2NEON_PRECISE_DP + const int64_t bit4Mask = imm & 0x10 ? UINT64_MAX : 0; + const int64_t bit5Mask = imm & 0x20 ? UINT64_MAX : 0; +#endif + // Conditional multiplication +#if !SSE2NEON_PRECISE_DP + __m128d mul = _mm_mul_pd(a, b); + const __m128d mulMask = + _mm_castsi128_pd(_mm_set_epi64x(bit5Mask, bit4Mask)); + __m128d tmp = _mm_and_pd(mul, mulMask); +#else +#if defined(__aarch64__) || defined(_M_ARM64) + double d0 = (imm & 0x10) ? vgetq_lane_f64(vreinterpretq_f64_m128d(a), 0) * + vgetq_lane_f64(vreinterpretq_f64_m128d(b), 0) + : 0; + double d1 = (imm & 0x20) ? vgetq_lane_f64(vreinterpretq_f64_m128d(a), 1) * + vgetq_lane_f64(vreinterpretq_f64_m128d(b), 1) + : 0; +#else + double d0 = (imm & 0x10) ? ((double *) &a)[0] * ((double *) &b)[0] : 0; + double d1 = (imm & 0x20) ? ((double *) &a)[1] * ((double *) &b)[1] : 0; +#endif + __m128d tmp = _mm_set_pd(d1, d0); +#endif + // Sum the products +#if defined(__aarch64__) || defined(_M_ARM64) + double sum = vpaddd_f64(vreinterpretq_f64_m128d(tmp)); +#else + double sum = *((double *) &tmp) + *(((double *) &tmp) + 1); +#endif + // Conditionally store the sum + const __m128d sumMask = + _mm_castsi128_pd(_mm_set_epi64x(bit1Mask, bit0Mask)); + __m128d res = _mm_and_pd(_mm_set_pd1(sum), sumMask); + return res; +} + +// Conditionally multiply the packed single-precision (32-bit) floating-point +// elements in a and b using the high 4 bits in imm8, sum the four products, +// and conditionally store the sum in dst using the low 4 bits of imm. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_dp_ps +FORCE_INLINE __m128 _mm_dp_ps(__m128 a, __m128 b, const int imm) +{ + float32x4_t elementwise_prod = _mm_mul_ps(a, b); + +#if defined(__aarch64__) || defined(_M_ARM64) + /* shortcuts */ + if (imm == 0xFF) { + return _mm_set1_ps(vaddvq_f32(elementwise_prod)); + } + + if ((imm & 0x0F) == 0x0F) { + if (!(imm & (1 << 4))) + elementwise_prod = vsetq_lane_f32(0.0f, elementwise_prod, 0); + if (!(imm & (1 << 5))) + elementwise_prod = vsetq_lane_f32(0.0f, elementwise_prod, 1); + if (!(imm & (1 << 6))) + elementwise_prod = vsetq_lane_f32(0.0f, elementwise_prod, 2); + if (!(imm & (1 << 7))) + elementwise_prod = vsetq_lane_f32(0.0f, elementwise_prod, 3); + + return _mm_set1_ps(vaddvq_f32(elementwise_prod)); + } +#endif + + float s = 0.0f; + + if (imm & (1 << 4)) + s += vgetq_lane_f32(elementwise_prod, 0); + if (imm & (1 << 5)) + s += vgetq_lane_f32(elementwise_prod, 1); + if (imm & (1 << 6)) + s += vgetq_lane_f32(elementwise_prod, 2); + if (imm & (1 << 7)) + s += vgetq_lane_f32(elementwise_prod, 3); + + const float32_t res[4] = { + (imm & 0x1) ? s : 0.0f, + (imm & 0x2) ? s : 0.0f, + (imm & 0x4) ? s : 0.0f, + (imm & 0x8) ? s : 0.0f, + }; + return vreinterpretq_m128_f32(vld1q_f32(res)); +} + +// Extract a 32-bit integer from a, selected with imm8, and store the result in +// dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_extract_epi32 +// FORCE_INLINE int _mm_extract_epi32(__m128i a, __constrange(0,4) int imm) +#define _mm_extract_epi32(a, imm) \ + vgetq_lane_s32(vreinterpretq_s32_m128i(a), (imm)) + +// Extract a 64-bit integer from a, selected with imm8, and store the result in +// dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_extract_epi64 +// FORCE_INLINE __int64 _mm_extract_epi64(__m128i a, __constrange(0,2) int imm) +#define _mm_extract_epi64(a, imm) \ + vgetq_lane_s64(vreinterpretq_s64_m128i(a), (imm)) + +// Extract an 8-bit integer from a, selected with imm8, and store the result in +// the lower element of dst. FORCE_INLINE int _mm_extract_epi8(__m128i a, +// __constrange(0,16) int imm) +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_extract_epi8 +#define _mm_extract_epi8(a, imm) vgetq_lane_u8(vreinterpretq_u8_m128i(a), (imm)) + +// Extracts the selected single-precision (32-bit) floating-point from a. +// FORCE_INLINE int _mm_extract_ps(__m128 a, __constrange(0,4) int imm) +#define _mm_extract_ps(a, imm) vgetq_lane_s32(vreinterpretq_s32_m128(a), (imm)) + +// Round the packed double-precision (64-bit) floating-point elements in a down +// to an integer value, and store the results as packed double-precision +// floating-point elements in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_floor_pd +FORCE_INLINE __m128d _mm_floor_pd(__m128d a) +{ +#if defined(__aarch64__) || defined(_M_ARM64) + return vreinterpretq_m128d_f64(vrndmq_f64(vreinterpretq_f64_m128d(a))); +#else + double *f = (double *) &a; + return _mm_set_pd(floor(f[1]), floor(f[0])); +#endif +} + +// Round the packed single-precision (32-bit) floating-point elements in a down +// to an integer value, and store the results as packed single-precision +// floating-point elements in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_floor_ps +FORCE_INLINE __m128 _mm_floor_ps(__m128 a) +{ +#if (defined(__aarch64__) || defined(_M_ARM64)) || \ + defined(__ARM_FEATURE_DIRECTED_ROUNDING) + return vreinterpretq_m128_f32(vrndmq_f32(vreinterpretq_f32_m128(a))); +#else + float *f = (float *) &a; + return _mm_set_ps(floorf(f[3]), floorf(f[2]), floorf(f[1]), floorf(f[0])); +#endif +} + +// Round the lower double-precision (64-bit) floating-point element in b down to +// an integer value, store the result as a double-precision floating-point +// element in the lower element of dst, and copy the upper element from a to the +// upper element of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_floor_sd +FORCE_INLINE __m128d _mm_floor_sd(__m128d a, __m128d b) +{ + return _mm_move_sd(a, _mm_floor_pd(b)); +} + +// Round the lower single-precision (32-bit) floating-point element in b down to +// an integer value, store the result as a single-precision floating-point +// element in the lower element of dst, and copy the upper 3 packed elements +// from a to the upper elements of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_floor_ss +FORCE_INLINE __m128 _mm_floor_ss(__m128 a, __m128 b) +{ + return _mm_move_ss(a, _mm_floor_ps(b)); +} + +// Copy a to dst, and insert the 32-bit integer i into dst at the location +// specified by imm8. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_insert_epi32 +// FORCE_INLINE __m128i _mm_insert_epi32(__m128i a, int b, +// __constrange(0,4) int imm) +#define _mm_insert_epi32(a, b, imm) \ + vreinterpretq_m128i_s32( \ + vsetq_lane_s32((b), vreinterpretq_s32_m128i(a), (imm))) + +// Copy a to dst, and insert the 64-bit integer i into dst at the location +// specified by imm8. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_insert_epi64 +// FORCE_INLINE __m128i _mm_insert_epi64(__m128i a, __int64 b, +// __constrange(0,2) int imm) +#define _mm_insert_epi64(a, b, imm) \ + vreinterpretq_m128i_s64( \ + vsetq_lane_s64((b), vreinterpretq_s64_m128i(a), (imm))) + +// Copy a to dst, and insert the lower 8-bit integer from i into dst at the +// location specified by imm8. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_insert_epi8 +// FORCE_INLINE __m128i _mm_insert_epi8(__m128i a, int b, +// __constrange(0,16) int imm) +#define _mm_insert_epi8(a, b, imm) \ + vreinterpretq_m128i_s8(vsetq_lane_s8((b), vreinterpretq_s8_m128i(a), (imm))) + +// Copy a to tmp, then insert a single-precision (32-bit) floating-point +// element from b into tmp using the control in imm8. Store tmp to dst using +// the mask in imm8 (elements are zeroed out when the corresponding bit is set). +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=insert_ps +#define _mm_insert_ps(a, b, imm8) \ + _sse2neon_define2( \ + __m128, a, b, \ + float32x4_t tmp1 = \ + vsetq_lane_f32(vgetq_lane_f32(_b, (imm8 >> 6) & 0x3), \ + vreinterpretq_f32_m128(_a), 0); \ + float32x4_t tmp2 = \ + vsetq_lane_f32(vgetq_lane_f32(tmp1, 0), \ + vreinterpretq_f32_m128(_a), ((imm8 >> 4) & 0x3)); \ + const uint32_t data[4] = \ + _sse2neon_init(((imm8) & (1 << 0)) ? UINT32_MAX : 0, \ + ((imm8) & (1 << 1)) ? UINT32_MAX : 0, \ + ((imm8) & (1 << 2)) ? UINT32_MAX : 0, \ + ((imm8) & (1 << 3)) ? UINT32_MAX : 0); \ + uint32x4_t mask = vld1q_u32(data); \ + float32x4_t all_zeros = vdupq_n_f32(0); \ + \ + _sse2neon_return(vreinterpretq_m128_f32( \ + vbslq_f32(mask, all_zeros, vreinterpretq_f32_m128(tmp2))));) + +// Compare packed signed 32-bit integers in a and b, and store packed maximum +// values in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_epi32 +FORCE_INLINE __m128i _mm_max_epi32(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_s32( + vmaxq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b))); +} + +// Compare packed signed 8-bit integers in a and b, and store packed maximum +// values in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_epi8 +FORCE_INLINE __m128i _mm_max_epi8(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_s8( + vmaxq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b))); +} + +// Compare packed unsigned 16-bit integers in a and b, and store packed maximum +// values in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_epu16 +FORCE_INLINE __m128i _mm_max_epu16(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_u16( + vmaxq_u16(vreinterpretq_u16_m128i(a), vreinterpretq_u16_m128i(b))); +} + +// Compare packed unsigned 32-bit integers in a and b, and store packed maximum +// values in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_epu32 +FORCE_INLINE __m128i _mm_max_epu32(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_u32( + vmaxq_u32(vreinterpretq_u32_m128i(a), vreinterpretq_u32_m128i(b))); +} + +// Compare packed signed 32-bit integers in a and b, and store packed minimum +// values in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_epi32 +FORCE_INLINE __m128i _mm_min_epi32(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_s32( + vminq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b))); +} + +// Compare packed signed 8-bit integers in a and b, and store packed minimum +// values in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_epi8 +FORCE_INLINE __m128i _mm_min_epi8(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_s8( + vminq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b))); +} + +// Compare packed unsigned 16-bit integers in a and b, and store packed minimum +// values in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_epu16 +FORCE_INLINE __m128i _mm_min_epu16(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_u16( + vminq_u16(vreinterpretq_u16_m128i(a), vreinterpretq_u16_m128i(b))); +} + +// Compare packed unsigned 32-bit integers in a and b, and store packed minimum +// values in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_epu32 +FORCE_INLINE __m128i _mm_min_epu32(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_u32( + vminq_u32(vreinterpretq_u32_m128i(a), vreinterpretq_u32_m128i(b))); +} + +// Horizontally compute the minimum amongst the packed unsigned 16-bit integers +// in a, store the minimum and index in dst, and zero the remaining bits in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_minpos_epu16 +FORCE_INLINE __m128i _mm_minpos_epu16(__m128i a) +{ + __m128i dst; + uint16_t min, idx = 0; +#if defined(__aarch64__) || defined(_M_ARM64) + // Find the minimum value + min = vminvq_u16(vreinterpretq_u16_m128i(a)); + + // Get the index of the minimum value + static const uint16_t idxv[] = {0, 1, 2, 3, 4, 5, 6, 7}; + uint16x8_t minv = vdupq_n_u16(min); + uint16x8_t cmeq = vceqq_u16(minv, vreinterpretq_u16_m128i(a)); + idx = vminvq_u16(vornq_u16(vld1q_u16(idxv), cmeq)); +#else + // Find the minimum value + __m64 tmp; + tmp = vreinterpret_m64_u16( + vmin_u16(vget_low_u16(vreinterpretq_u16_m128i(a)), + vget_high_u16(vreinterpretq_u16_m128i(a)))); + tmp = vreinterpret_m64_u16( + vpmin_u16(vreinterpret_u16_m64(tmp), vreinterpret_u16_m64(tmp))); + tmp = vreinterpret_m64_u16( + vpmin_u16(vreinterpret_u16_m64(tmp), vreinterpret_u16_m64(tmp))); + min = vget_lane_u16(vreinterpret_u16_m64(tmp), 0); + // Get the index of the minimum value + int i; + for (i = 0; i < 8; i++) { + if (min == vgetq_lane_u16(vreinterpretq_u16_m128i(a), 0)) { + idx = (uint16_t) i; + break; + } + a = _mm_srli_si128(a, 2); + } +#endif + // Generate result + dst = _mm_setzero_si128(); + dst = vreinterpretq_m128i_u16( + vsetq_lane_u16(min, vreinterpretq_u16_m128i(dst), 0)); + dst = vreinterpretq_m128i_u16( + vsetq_lane_u16(idx, vreinterpretq_u16_m128i(dst), 1)); + return dst; +} + +// Compute the sum of absolute differences (SADs) of quadruplets of unsigned +// 8-bit integers in a compared to those in b, and store the 16-bit results in +// dst. Eight SADs are performed using one quadruplet from b and eight +// quadruplets from a. One quadruplet is selected from b starting at on the +// offset specified in imm8. Eight quadruplets are formed from sequential 8-bit +// integers selected from a starting at the offset specified in imm8. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mpsadbw_epu8 +FORCE_INLINE __m128i _mm_mpsadbw_epu8(__m128i a, __m128i b, const int imm) +{ + uint8x16_t _a, _b; + + switch (imm & 0x4) { + case 0: + // do nothing + _a = vreinterpretq_u8_m128i(a); + break; + case 4: + _a = vreinterpretq_u8_u32(vextq_u32(vreinterpretq_u32_m128i(a), + vreinterpretq_u32_m128i(a), 1)); + break; + default: +#if defined(__GNUC__) || defined(__clang__) + __builtin_unreachable(); +#elif defined(_MSC_VER) + __assume(0); +#endif + break; + } + + switch (imm & 0x3) { + case 0: + _b = vreinterpretq_u8_u32( + vdupq_n_u32(vgetq_lane_u32(vreinterpretq_u32_m128i(b), 0))); + break; + case 1: + _b = vreinterpretq_u8_u32( + vdupq_n_u32(vgetq_lane_u32(vreinterpretq_u32_m128i(b), 1))); + break; + case 2: + _b = vreinterpretq_u8_u32( + vdupq_n_u32(vgetq_lane_u32(vreinterpretq_u32_m128i(b), 2))); + break; + case 3: + _b = vreinterpretq_u8_u32( + vdupq_n_u32(vgetq_lane_u32(vreinterpretq_u32_m128i(b), 3))); + break; + default: +#if defined(__GNUC__) || defined(__clang__) + __builtin_unreachable(); +#elif defined(_MSC_VER) + __assume(0); +#endif + break; + } + + int16x8_t c04, c15, c26, c37; + uint8x8_t low_b = vget_low_u8(_b); + c04 = vreinterpretq_s16_u16(vabdl_u8(vget_low_u8(_a), low_b)); + uint8x16_t _a_1 = vextq_u8(_a, _a, 1); + c15 = vreinterpretq_s16_u16(vabdl_u8(vget_low_u8(_a_1), low_b)); + uint8x16_t _a_2 = vextq_u8(_a, _a, 2); + c26 = vreinterpretq_s16_u16(vabdl_u8(vget_low_u8(_a_2), low_b)); + uint8x16_t _a_3 = vextq_u8(_a, _a, 3); + c37 = vreinterpretq_s16_u16(vabdl_u8(vget_low_u8(_a_3), low_b)); +#if defined(__aarch64__) || defined(_M_ARM64) + // |0|4|2|6| + c04 = vpaddq_s16(c04, c26); + // |1|5|3|7| + c15 = vpaddq_s16(c15, c37); + + int32x4_t trn1_c = + vtrn1q_s32(vreinterpretq_s32_s16(c04), vreinterpretq_s32_s16(c15)); + int32x4_t trn2_c = + vtrn2q_s32(vreinterpretq_s32_s16(c04), vreinterpretq_s32_s16(c15)); + return vreinterpretq_m128i_s16(vpaddq_s16(vreinterpretq_s16_s32(trn1_c), + vreinterpretq_s16_s32(trn2_c))); +#else + int16x4_t c01, c23, c45, c67; + c01 = vpadd_s16(vget_low_s16(c04), vget_low_s16(c15)); + c23 = vpadd_s16(vget_low_s16(c26), vget_low_s16(c37)); + c45 = vpadd_s16(vget_high_s16(c04), vget_high_s16(c15)); + c67 = vpadd_s16(vget_high_s16(c26), vget_high_s16(c37)); + + return vreinterpretq_m128i_s16( + vcombine_s16(vpadd_s16(c01, c23), vpadd_s16(c45, c67))); +#endif +} + +// Multiply the low signed 32-bit integers from each packed 64-bit element in +// a and b, and store the signed 64-bit results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mul_epi32 +FORCE_INLINE __m128i _mm_mul_epi32(__m128i a, __m128i b) +{ + // vmull_s32 upcasts instead of masking, so we downcast. + int32x2_t a_lo = vmovn_s64(vreinterpretq_s64_m128i(a)); + int32x2_t b_lo = vmovn_s64(vreinterpretq_s64_m128i(b)); + return vreinterpretq_m128i_s64(vmull_s32(a_lo, b_lo)); +} + +// Multiply the packed 32-bit integers in a and b, producing intermediate 64-bit +// integers, and store the low 32 bits of the intermediate integers in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mullo_epi32 +FORCE_INLINE __m128i _mm_mullo_epi32(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_s32( + vmulq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b))); +} + +// Convert packed signed 32-bit integers from a and b to packed 16-bit integers +// using unsigned saturation, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_packus_epi32 +FORCE_INLINE __m128i _mm_packus_epi32(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_u16( + vcombine_u16(vqmovun_s32(vreinterpretq_s32_m128i(a)), + vqmovun_s32(vreinterpretq_s32_m128i(b)))); +} + +// Round the packed double-precision (64-bit) floating-point elements in a using +// the rounding parameter, and store the results as packed double-precision +// floating-point elements in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_round_pd +FORCE_INLINE __m128d _mm_round_pd(__m128d a, int rounding) +{ +#if defined(__aarch64__) || defined(_M_ARM64) + switch (rounding) { + case (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC): + return vreinterpretq_m128d_f64(vrndnq_f64(vreinterpretq_f64_m128d(a))); + case (_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC): + return _mm_floor_pd(a); + case (_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC): + return _mm_ceil_pd(a); + case (_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC): + return vreinterpretq_m128d_f64(vrndq_f64(vreinterpretq_f64_m128d(a))); + default: //_MM_FROUND_CUR_DIRECTION + return vreinterpretq_m128d_f64(vrndiq_f64(vreinterpretq_f64_m128d(a))); + } +#else + double *v_double = (double *) &a; + + if (rounding == (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC) || + (rounding == _MM_FROUND_CUR_DIRECTION && + _MM_GET_ROUNDING_MODE() == _MM_ROUND_NEAREST)) { + double res[2], tmp; + for (int i = 0; i < 2; i++) { + tmp = (v_double[i] < 0) ? -v_double[i] : v_double[i]; + double roundDown = floor(tmp); // Round down value + double roundUp = ceil(tmp); // Round up value + double diffDown = tmp - roundDown; + double diffUp = roundUp - tmp; + if (diffDown < diffUp) { + /* If it's closer to the round down value, then use it */ + res[i] = roundDown; + } else if (diffDown > diffUp) { + /* If it's closer to the round up value, then use it */ + res[i] = roundUp; + } else { + /* If it's equidistant between round up and round down value, + * pick the one which is an even number */ + double half = roundDown / 2; + if (half != floor(half)) { + /* If the round down value is odd, return the round up value + */ + res[i] = roundUp; + } else { + /* If the round up value is odd, return the round down value + */ + res[i] = roundDown; + } + } + res[i] = (v_double[i] < 0) ? -res[i] : res[i]; + } + return _mm_set_pd(res[1], res[0]); + } else if (rounding == (_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC) || + (rounding == _MM_FROUND_CUR_DIRECTION && + _MM_GET_ROUNDING_MODE() == _MM_ROUND_DOWN)) { + return _mm_floor_pd(a); + } else if (rounding == (_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC) || + (rounding == _MM_FROUND_CUR_DIRECTION && + _MM_GET_ROUNDING_MODE() == _MM_ROUND_UP)) { + return _mm_ceil_pd(a); + } + return _mm_set_pd(v_double[1] > 0 ? floor(v_double[1]) : ceil(v_double[1]), + v_double[0] > 0 ? floor(v_double[0]) : ceil(v_double[0])); +#endif +} + +// Round the packed single-precision (32-bit) floating-point elements in a using +// the rounding parameter, and store the results as packed single-precision +// floating-point elements in dst. +// software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_round_ps +FORCE_INLINE __m128 _mm_round_ps(__m128 a, int rounding) +{ +#if (defined(__aarch64__) || defined(_M_ARM64)) || \ + defined(__ARM_FEATURE_DIRECTED_ROUNDING) + switch (rounding) { + case (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC): + return vreinterpretq_m128_f32(vrndnq_f32(vreinterpretq_f32_m128(a))); + case (_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC): + return _mm_floor_ps(a); + case (_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC): + return _mm_ceil_ps(a); + case (_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC): + return vreinterpretq_m128_f32(vrndq_f32(vreinterpretq_f32_m128(a))); + default: //_MM_FROUND_CUR_DIRECTION + return vreinterpretq_m128_f32(vrndiq_f32(vreinterpretq_f32_m128(a))); + } +#else + float *v_float = (float *) &a; + + if (rounding == (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC) || + (rounding == _MM_FROUND_CUR_DIRECTION && + _MM_GET_ROUNDING_MODE() == _MM_ROUND_NEAREST)) { + uint32x4_t signmask = vdupq_n_u32(0x80000000); + float32x4_t half = vbslq_f32(signmask, vreinterpretq_f32_m128(a), + vdupq_n_f32(0.5f)); /* +/- 0.5 */ + int32x4_t r_normal = vcvtq_s32_f32(vaddq_f32( + vreinterpretq_f32_m128(a), half)); /* round to integer: [a + 0.5]*/ + int32x4_t r_trunc = vcvtq_s32_f32( + vreinterpretq_f32_m128(a)); /* truncate to integer: [a] */ + int32x4_t plusone = vreinterpretq_s32_u32(vshrq_n_u32( + vreinterpretq_u32_s32(vnegq_s32(r_trunc)), 31)); /* 1 or 0 */ + int32x4_t r_even = vbicq_s32(vaddq_s32(r_trunc, plusone), + vdupq_n_s32(1)); /* ([a] + {0,1}) & ~1 */ + float32x4_t delta = vsubq_f32( + vreinterpretq_f32_m128(a), + vcvtq_f32_s32(r_trunc)); /* compute delta: delta = (a - [a]) */ + uint32x4_t is_delta_half = + vceqq_f32(delta, half); /* delta == +/- 0.5 */ + return vreinterpretq_m128_f32( + vcvtq_f32_s32(vbslq_s32(is_delta_half, r_even, r_normal))); + } else if (rounding == (_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC) || + (rounding == _MM_FROUND_CUR_DIRECTION && + _MM_GET_ROUNDING_MODE() == _MM_ROUND_DOWN)) { + return _mm_floor_ps(a); + } else if (rounding == (_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC) || + (rounding == _MM_FROUND_CUR_DIRECTION && + _MM_GET_ROUNDING_MODE() == _MM_ROUND_UP)) { + return _mm_ceil_ps(a); + } + return _mm_set_ps(v_float[3] > 0 ? floorf(v_float[3]) : ceilf(v_float[3]), + v_float[2] > 0 ? floorf(v_float[2]) : ceilf(v_float[2]), + v_float[1] > 0 ? floorf(v_float[1]) : ceilf(v_float[1]), + v_float[0] > 0 ? floorf(v_float[0]) : ceilf(v_float[0])); +#endif +} + +// Round the lower double-precision (64-bit) floating-point element in b using +// the rounding parameter, store the result as a double-precision floating-point +// element in the lower element of dst, and copy the upper element from a to the +// upper element of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_round_sd +FORCE_INLINE __m128d _mm_round_sd(__m128d a, __m128d b, int rounding) +{ + return _mm_move_sd(a, _mm_round_pd(b, rounding)); +} + +// Round the lower single-precision (32-bit) floating-point element in b using +// the rounding parameter, store the result as a single-precision floating-point +// element in the lower element of dst, and copy the upper 3 packed elements +// from a to the upper elements of dst. Rounding is done according to the +// rounding[3:0] parameter, which can be one of: +// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and +// suppress exceptions +// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and +// suppress exceptions +// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress +// exceptions +// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress +// exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see +// _MM_SET_ROUNDING_MODE +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_round_ss +FORCE_INLINE __m128 _mm_round_ss(__m128 a, __m128 b, int rounding) +{ + return _mm_move_ss(a, _mm_round_ps(b, rounding)); +} + +// Load 128-bits of integer data from memory into dst using a non-temporal +// memory hint. mem_addr must be aligned on a 16-byte boundary or a +// general-protection exception may be generated. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_stream_load_si128 +FORCE_INLINE __m128i _mm_stream_load_si128(__m128i *p) +{ +#if __has_builtin(__builtin_nontemporal_store) + return __builtin_nontemporal_load(p); +#else + return vreinterpretq_m128i_s64(vld1q_s64((int64_t *) p)); +#endif +} + +// Compute the bitwise NOT of a and then AND with a 128-bit vector containing +// all 1's, and return 1 if the result is zero, otherwise return 0. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_test_all_ones +FORCE_INLINE int _mm_test_all_ones(__m128i a) +{ + return (uint64_t) (vgetq_lane_s64(a, 0) & vgetq_lane_s64(a, 1)) == + ~(uint64_t) 0; +} + +// Compute the bitwise AND of 128 bits (representing integer data) in a and +// mask, and return 1 if the result is zero, otherwise return 0. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_test_all_zeros +FORCE_INLINE int _mm_test_all_zeros(__m128i a, __m128i mask) +{ + int64x2_t a_and_mask = + vandq_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(mask)); + return !(vgetq_lane_s64(a_and_mask, 0) | vgetq_lane_s64(a_and_mask, 1)); +} + +// Compute the bitwise AND of 128 bits (representing integer data) in a and +// mask, and set ZF to 1 if the result is zero, otherwise set ZF to 0. Compute +// the bitwise NOT of a and then AND with mask, and set CF to 1 if the result is +// zero, otherwise set CF to 0. Return 1 if both the ZF and CF values are zero, +// otherwise return 0. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_test_mix_ones_zero +// Note: Argument names may be wrong in the Intel intrinsics guide. +FORCE_INLINE int _mm_test_mix_ones_zeros(__m128i a, __m128i mask) +{ + uint64x2_t v = vreinterpretq_u64_m128i(a); + uint64x2_t m = vreinterpretq_u64_m128i(mask); + + // find ones (set-bits) and zeros (clear-bits) under clip mask + uint64x2_t ones = vandq_u64(m, v); + uint64x2_t zeros = vbicq_u64(m, v); + + // If both 128-bit variables are populated (non-zero) then return 1. + // For comparision purposes, first compact each var down to 32-bits. + uint32x2_t reduced = vpmax_u32(vqmovn_u64(ones), vqmovn_u64(zeros)); + + // if folding minimum is non-zero then both vars must be non-zero + return (vget_lane_u32(vpmin_u32(reduced, reduced), 0) != 0); +} + +// Compute the bitwise AND of 128 bits (representing integer data) in a and b, +// and set ZF to 1 if the result is zero, otherwise set ZF to 0. Compute the +// bitwise NOT of a and then AND with b, and set CF to 1 if the result is zero, +// otherwise set CF to 0. Return the CF value. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_testc_si128 +FORCE_INLINE int _mm_testc_si128(__m128i a, __m128i b) +{ + int64x2_t s64 = + vbicq_s64(vreinterpretq_s64_m128i(b), vreinterpretq_s64_m128i(a)); + return !(vgetq_lane_s64(s64, 0) | vgetq_lane_s64(s64, 1)); +} + +// Compute the bitwise AND of 128 bits (representing integer data) in a and b, +// and set ZF to 1 if the result is zero, otherwise set ZF to 0. Compute the +// bitwise NOT of a and then AND with b, and set CF to 1 if the result is zero, +// otherwise set CF to 0. Return 1 if both the ZF and CF values are zero, +// otherwise return 0. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_testnzc_si128 +#define _mm_testnzc_si128(a, b) _mm_test_mix_ones_zeros(a, b) + +// Compute the bitwise AND of 128 bits (representing integer data) in a and b, +// and set ZF to 1 if the result is zero, otherwise set ZF to 0. Compute the +// bitwise NOT of a and then AND with b, and set CF to 1 if the result is zero, +// otherwise set CF to 0. Return the ZF value. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_testz_si128 +FORCE_INLINE int _mm_testz_si128(__m128i a, __m128i b) +{ + int64x2_t s64 = + vandq_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(b)); + return !(vgetq_lane_s64(s64, 0) | vgetq_lane_s64(s64, 1)); +} + +/* SSE4.2 */ + +static const uint16_t ALIGN_STRUCT(16) _sse2neon_cmpestr_mask16b[8] = { + 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, +}; +static const uint8_t ALIGN_STRUCT(16) _sse2neon_cmpestr_mask8b[16] = { + 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, + 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, +}; + +/* specify the source data format */ +#define _SIDD_UBYTE_OPS 0x00 /* unsigned 8-bit characters */ +#define _SIDD_UWORD_OPS 0x01 /* unsigned 16-bit characters */ +#define _SIDD_SBYTE_OPS 0x02 /* signed 8-bit characters */ +#define _SIDD_SWORD_OPS 0x03 /* signed 16-bit characters */ + +/* specify the comparison operation */ +#define _SIDD_CMP_EQUAL_ANY 0x00 /* compare equal any: strchr */ +#define _SIDD_CMP_RANGES 0x04 /* compare ranges */ +#define _SIDD_CMP_EQUAL_EACH 0x08 /* compare equal each: strcmp */ +#define _SIDD_CMP_EQUAL_ORDERED 0x0C /* compare equal ordered */ + +/* specify the polarity */ +#define _SIDD_POSITIVE_POLARITY 0x00 +#define _SIDD_MASKED_POSITIVE_POLARITY 0x20 +#define _SIDD_NEGATIVE_POLARITY 0x10 /* negate results */ +#define _SIDD_MASKED_NEGATIVE_POLARITY \ + 0x30 /* negate results only before end of string */ + +/* specify the output selection in _mm_cmpXstri */ +#define _SIDD_LEAST_SIGNIFICANT 0x00 +#define _SIDD_MOST_SIGNIFICANT 0x40 + +/* specify the output selection in _mm_cmpXstrm */ +#define _SIDD_BIT_MASK 0x00 +#define _SIDD_UNIT_MASK 0x40 + +/* Pattern Matching for C macros. + * https://github.com/pfultz2/Cloak/wiki/C-Preprocessor-tricks,-tips,-and-idioms + */ + +/* catenate */ +#define SSE2NEON_PRIMITIVE_CAT(a, ...) a##__VA_ARGS__ +#define SSE2NEON_CAT(a, b) SSE2NEON_PRIMITIVE_CAT(a, b) + +#define SSE2NEON_IIF(c) SSE2NEON_PRIMITIVE_CAT(SSE2NEON_IIF_, c) +/* run the 2nd parameter */ +#define SSE2NEON_IIF_0(t, ...) __VA_ARGS__ +/* run the 1st parameter */ +#define SSE2NEON_IIF_1(t, ...) t + +#define SSE2NEON_COMPL(b) SSE2NEON_PRIMITIVE_CAT(SSE2NEON_COMPL_, b) +#define SSE2NEON_COMPL_0 1 +#define SSE2NEON_COMPL_1 0 + +#define SSE2NEON_DEC(x) SSE2NEON_PRIMITIVE_CAT(SSE2NEON_DEC_, x) +#define SSE2NEON_DEC_1 0 +#define SSE2NEON_DEC_2 1 +#define SSE2NEON_DEC_3 2 +#define SSE2NEON_DEC_4 3 +#define SSE2NEON_DEC_5 4 +#define SSE2NEON_DEC_6 5 +#define SSE2NEON_DEC_7 6 +#define SSE2NEON_DEC_8 7 +#define SSE2NEON_DEC_9 8 +#define SSE2NEON_DEC_10 9 +#define SSE2NEON_DEC_11 10 +#define SSE2NEON_DEC_12 11 +#define SSE2NEON_DEC_13 12 +#define SSE2NEON_DEC_14 13 +#define SSE2NEON_DEC_15 14 +#define SSE2NEON_DEC_16 15 + +/* detection */ +#define SSE2NEON_CHECK_N(x, n, ...) n +#define SSE2NEON_CHECK(...) SSE2NEON_CHECK_N(__VA_ARGS__, 0, ) +#define SSE2NEON_PROBE(x) x, 1, + +#define SSE2NEON_NOT(x) SSE2NEON_CHECK(SSE2NEON_PRIMITIVE_CAT(SSE2NEON_NOT_, x)) +#define SSE2NEON_NOT_0 SSE2NEON_PROBE(~) + +#define SSE2NEON_BOOL(x) SSE2NEON_COMPL(SSE2NEON_NOT(x)) +#define SSE2NEON_IF(c) SSE2NEON_IIF(SSE2NEON_BOOL(c)) + +#define SSE2NEON_EAT(...) +#define SSE2NEON_EXPAND(...) __VA_ARGS__ +#define SSE2NEON_WHEN(c) SSE2NEON_IF(c)(SSE2NEON_EXPAND, SSE2NEON_EAT) + +/* recursion */ +/* deferred expression */ +#define SSE2NEON_EMPTY() +#define SSE2NEON_DEFER(id) id SSE2NEON_EMPTY() +#define SSE2NEON_OBSTRUCT(...) __VA_ARGS__ SSE2NEON_DEFER(SSE2NEON_EMPTY)() +#define SSE2NEON_EXPAND(...) __VA_ARGS__ + +#define SSE2NEON_EVAL(...) \ + SSE2NEON_EVAL1(SSE2NEON_EVAL1(SSE2NEON_EVAL1(__VA_ARGS__))) +#define SSE2NEON_EVAL1(...) \ + SSE2NEON_EVAL2(SSE2NEON_EVAL2(SSE2NEON_EVAL2(__VA_ARGS__))) +#define SSE2NEON_EVAL2(...) \ + SSE2NEON_EVAL3(SSE2NEON_EVAL3(SSE2NEON_EVAL3(__VA_ARGS__))) +#define SSE2NEON_EVAL3(...) __VA_ARGS__ + +#define SSE2NEON_REPEAT(count, macro, ...) \ + SSE2NEON_WHEN(count) \ + (SSE2NEON_OBSTRUCT(SSE2NEON_REPEAT_INDIRECT)()( \ + SSE2NEON_DEC(count), macro, \ + __VA_ARGS__) SSE2NEON_OBSTRUCT(macro)(SSE2NEON_DEC(count), \ + __VA_ARGS__)) +#define SSE2NEON_REPEAT_INDIRECT() SSE2NEON_REPEAT + +#define SSE2NEON_SIZE_OF_byte 8 +#define SSE2NEON_NUMBER_OF_LANES_byte 16 +#define SSE2NEON_SIZE_OF_word 16 +#define SSE2NEON_NUMBER_OF_LANES_word 8 + +#define SSE2NEON_COMPARE_EQUAL_THEN_FILL_LANE(i, type) \ + mtx[i] = vreinterpretq_m128i_##type(vceqq_##type( \ + vdupq_n_##type(vgetq_lane_##type(vreinterpretq_##type##_m128i(b), i)), \ + vreinterpretq_##type##_m128i(a))); + +#define SSE2NEON_FILL_LANE(i, type) \ + vec_b[i] = \ + vdupq_n_##type(vgetq_lane_##type(vreinterpretq_##type##_m128i(b), i)); + +#define PCMPSTR_RANGES(a, b, mtx, data_type_prefix, type_prefix, size, \ + number_of_lanes, byte_or_word) \ + do { \ + SSE2NEON_CAT( \ + data_type_prefix, \ + SSE2NEON_CAT(size, \ + SSE2NEON_CAT(x, SSE2NEON_CAT(number_of_lanes, _t)))) \ + vec_b[number_of_lanes]; \ + __m128i mask = SSE2NEON_IIF(byte_or_word)( \ + vreinterpretq_m128i_u16(vdupq_n_u16(0xff)), \ + vreinterpretq_m128i_u32(vdupq_n_u32(0xffff))); \ + SSE2NEON_EVAL(SSE2NEON_REPEAT(number_of_lanes, SSE2NEON_FILL_LANE, \ + SSE2NEON_CAT(type_prefix, size))) \ + for (int i = 0; i < number_of_lanes; i++) { \ + mtx[i] = SSE2NEON_CAT(vreinterpretq_m128i_u, \ + size)(SSE2NEON_CAT(vbslq_u, size)( \ + SSE2NEON_CAT(vreinterpretq_u, \ + SSE2NEON_CAT(size, _m128i))(mask), \ + SSE2NEON_CAT(vcgeq_, SSE2NEON_CAT(type_prefix, size))( \ + vec_b[i], \ + SSE2NEON_CAT( \ + vreinterpretq_, \ + SSE2NEON_CAT(type_prefix, \ + SSE2NEON_CAT(size, _m128i(a))))), \ + SSE2NEON_CAT(vcleq_, SSE2NEON_CAT(type_prefix, size))( \ + vec_b[i], \ + SSE2NEON_CAT( \ + vreinterpretq_, \ + SSE2NEON_CAT(type_prefix, \ + SSE2NEON_CAT(size, _m128i(a))))))); \ + } \ + } while (0) + +#define PCMPSTR_EQ(a, b, mtx, size, number_of_lanes) \ + do { \ + SSE2NEON_EVAL(SSE2NEON_REPEAT(number_of_lanes, \ + SSE2NEON_COMPARE_EQUAL_THEN_FILL_LANE, \ + SSE2NEON_CAT(u, size))) \ + } while (0) + +#define SSE2NEON_CMP_EQUAL_ANY_IMPL(type) \ + static int _sse2neon_cmp_##type##_equal_any(__m128i a, int la, __m128i b, \ + int lb) \ + { \ + __m128i mtx[16]; \ + PCMPSTR_EQ(a, b, mtx, SSE2NEON_CAT(SSE2NEON_SIZE_OF_, type), \ + SSE2NEON_CAT(SSE2NEON_NUMBER_OF_LANES_, type)); \ + return SSE2NEON_CAT( \ + _sse2neon_aggregate_equal_any_, \ + SSE2NEON_CAT( \ + SSE2NEON_CAT(SSE2NEON_SIZE_OF_, type), \ + SSE2NEON_CAT(x, SSE2NEON_CAT(SSE2NEON_NUMBER_OF_LANES_, \ + type))))(la, lb, mtx); \ + } + +#define SSE2NEON_CMP_RANGES_IMPL(type, data_type, us, byte_or_word) \ + static int _sse2neon_cmp_##us##type##_ranges(__m128i a, int la, __m128i b, \ + int lb) \ + { \ + __m128i mtx[16]; \ + PCMPSTR_RANGES( \ + a, b, mtx, data_type, us, SSE2NEON_CAT(SSE2NEON_SIZE_OF_, type), \ + SSE2NEON_CAT(SSE2NEON_NUMBER_OF_LANES_, type), byte_or_word); \ + return SSE2NEON_CAT( \ + _sse2neon_aggregate_ranges_, \ + SSE2NEON_CAT( \ + SSE2NEON_CAT(SSE2NEON_SIZE_OF_, type), \ + SSE2NEON_CAT(x, SSE2NEON_CAT(SSE2NEON_NUMBER_OF_LANES_, \ + type))))(la, lb, mtx); \ + } + +#define SSE2NEON_CMP_EQUAL_ORDERED_IMPL(type) \ + static int _sse2neon_cmp_##type##_equal_ordered(__m128i a, int la, \ + __m128i b, int lb) \ + { \ + __m128i mtx[16]; \ + PCMPSTR_EQ(a, b, mtx, SSE2NEON_CAT(SSE2NEON_SIZE_OF_, type), \ + SSE2NEON_CAT(SSE2NEON_NUMBER_OF_LANES_, type)); \ + return SSE2NEON_CAT( \ + _sse2neon_aggregate_equal_ordered_, \ + SSE2NEON_CAT( \ + SSE2NEON_CAT(SSE2NEON_SIZE_OF_, type), \ + SSE2NEON_CAT(x, \ + SSE2NEON_CAT(SSE2NEON_NUMBER_OF_LANES_, type))))( \ + SSE2NEON_CAT(SSE2NEON_NUMBER_OF_LANES_, type), la, lb, mtx); \ + } + +static int _sse2neon_aggregate_equal_any_8x16(int la, int lb, __m128i mtx[16]) +{ + int res = 0; + int m = (1 << la) - 1; + uint8x8_t vec_mask = vld1_u8(_sse2neon_cmpestr_mask8b); + uint8x8_t t_lo = vtst_u8(vdup_n_u8(m & 0xff), vec_mask); + uint8x8_t t_hi = vtst_u8(vdup_n_u8(m >> 8), vec_mask); + uint8x16_t vec = vcombine_u8(t_lo, t_hi); + for (int j = 0; j < lb; j++) { + mtx[j] = vreinterpretq_m128i_u8( + vandq_u8(vec, vreinterpretq_u8_m128i(mtx[j]))); + mtx[j] = vreinterpretq_m128i_u8( + vshrq_n_u8(vreinterpretq_u8_m128i(mtx[j]), 7)); + int tmp = _sse2neon_vaddvq_u8(vreinterpretq_u8_m128i(mtx[j])) ? 1 : 0; + res |= (tmp << j); + } + return res; +} + +static int _sse2neon_aggregate_equal_any_16x8(int la, int lb, __m128i mtx[16]) +{ + int res = 0; + int m = (1 << la) - 1; + uint16x8_t vec = + vtstq_u16(vdupq_n_u16(m), vld1q_u16(_sse2neon_cmpestr_mask16b)); + for (int j = 0; j < lb; j++) { + mtx[j] = vreinterpretq_m128i_u16( + vandq_u16(vec, vreinterpretq_u16_m128i(mtx[j]))); + mtx[j] = vreinterpretq_m128i_u16( + vshrq_n_u16(vreinterpretq_u16_m128i(mtx[j]), 15)); + int tmp = _sse2neon_vaddvq_u16(vreinterpretq_u16_m128i(mtx[j])) ? 1 : 0; + res |= (tmp << j); + } + return res; +} + +/* clang-format off */ +#define SSE2NEON_GENERATE_CMP_EQUAL_ANY(prefix) \ + prefix##IMPL(byte) \ + prefix##IMPL(word) +/* clang-format on */ + +SSE2NEON_GENERATE_CMP_EQUAL_ANY(SSE2NEON_CMP_EQUAL_ANY_) + +static int _sse2neon_aggregate_ranges_16x8(int la, int lb, __m128i mtx[16]) +{ + int res = 0; + int m = (1 << la) - 1; + uint16x8_t vec = + vtstq_u16(vdupq_n_u16(m), vld1q_u16(_sse2neon_cmpestr_mask16b)); + for (int j = 0; j < lb; j++) { + mtx[j] = vreinterpretq_m128i_u16( + vandq_u16(vec, vreinterpretq_u16_m128i(mtx[j]))); + mtx[j] = vreinterpretq_m128i_u16( + vshrq_n_u16(vreinterpretq_u16_m128i(mtx[j]), 15)); + __m128i tmp = vreinterpretq_m128i_u32( + vshrq_n_u32(vreinterpretq_u32_m128i(mtx[j]), 16)); + uint32x4_t vec_res = vandq_u32(vreinterpretq_u32_m128i(mtx[j]), + vreinterpretq_u32_m128i(tmp)); +#if defined(__aarch64__) || defined(_M_ARM64) + int t = vaddvq_u32(vec_res) ? 1 : 0; +#else + uint64x2_t sumh = vpaddlq_u32(vec_res); + int t = vgetq_lane_u64(sumh, 0) + vgetq_lane_u64(sumh, 1); +#endif + res |= (t << j); + } + return res; +} + +static int _sse2neon_aggregate_ranges_8x16(int la, int lb, __m128i mtx[16]) +{ + int res = 0; + int m = (1 << la) - 1; + uint8x8_t vec_mask = vld1_u8(_sse2neon_cmpestr_mask8b); + uint8x8_t t_lo = vtst_u8(vdup_n_u8(m & 0xff), vec_mask); + uint8x8_t t_hi = vtst_u8(vdup_n_u8(m >> 8), vec_mask); + uint8x16_t vec = vcombine_u8(t_lo, t_hi); + for (int j = 0; j < lb; j++) { + mtx[j] = vreinterpretq_m128i_u8( + vandq_u8(vec, vreinterpretq_u8_m128i(mtx[j]))); + mtx[j] = vreinterpretq_m128i_u8( + vshrq_n_u8(vreinterpretq_u8_m128i(mtx[j]), 7)); + __m128i tmp = vreinterpretq_m128i_u16( + vshrq_n_u16(vreinterpretq_u16_m128i(mtx[j]), 8)); + uint16x8_t vec_res = vandq_u16(vreinterpretq_u16_m128i(mtx[j]), + vreinterpretq_u16_m128i(tmp)); + int t = _sse2neon_vaddvq_u16(vec_res) ? 1 : 0; + res |= (t << j); + } + return res; +} + +#define SSE2NEON_CMP_RANGES_IS_BYTE 1 +#define SSE2NEON_CMP_RANGES_IS_WORD 0 + +/* clang-format off */ +#define SSE2NEON_GENERATE_CMP_RANGES(prefix) \ + prefix##IMPL(byte, uint, u, prefix##IS_BYTE) \ + prefix##IMPL(byte, int, s, prefix##IS_BYTE) \ + prefix##IMPL(word, uint, u, prefix##IS_WORD) \ + prefix##IMPL(word, int, s, prefix##IS_WORD) +/* clang-format on */ + +SSE2NEON_GENERATE_CMP_RANGES(SSE2NEON_CMP_RANGES_) + +#undef SSE2NEON_CMP_RANGES_IS_BYTE +#undef SSE2NEON_CMP_RANGES_IS_WORD + +static int _sse2neon_cmp_byte_equal_each(__m128i a, int la, __m128i b, int lb) +{ + uint8x16_t mtx = + vceqq_u8(vreinterpretq_u8_m128i(a), vreinterpretq_u8_m128i(b)); + int m0 = (la < lb) ? 0 : ((1 << la) - (1 << lb)); + int m1 = 0x10000 - (1 << la); + int tb = 0x10000 - (1 << lb); + uint8x8_t vec_mask, vec0_lo, vec0_hi, vec1_lo, vec1_hi; + uint8x8_t tmp_lo, tmp_hi, res_lo, res_hi; + vec_mask = vld1_u8(_sse2neon_cmpestr_mask8b); + vec0_lo = vtst_u8(vdup_n_u8(m0), vec_mask); + vec0_hi = vtst_u8(vdup_n_u8(m0 >> 8), vec_mask); + vec1_lo = vtst_u8(vdup_n_u8(m1), vec_mask); + vec1_hi = vtst_u8(vdup_n_u8(m1 >> 8), vec_mask); + tmp_lo = vtst_u8(vdup_n_u8(tb), vec_mask); + tmp_hi = vtst_u8(vdup_n_u8(tb >> 8), vec_mask); + + res_lo = vbsl_u8(vec0_lo, vdup_n_u8(0), vget_low_u8(mtx)); + res_hi = vbsl_u8(vec0_hi, vdup_n_u8(0), vget_high_u8(mtx)); + res_lo = vbsl_u8(vec1_lo, tmp_lo, res_lo); + res_hi = vbsl_u8(vec1_hi, tmp_hi, res_hi); + res_lo = vand_u8(res_lo, vec_mask); + res_hi = vand_u8(res_hi, vec_mask); + + int res = _sse2neon_vaddv_u8(res_lo) + (_sse2neon_vaddv_u8(res_hi) << 8); + return res; +} + +static int _sse2neon_cmp_word_equal_each(__m128i a, int la, __m128i b, int lb) +{ + uint16x8_t mtx = + vceqq_u16(vreinterpretq_u16_m128i(a), vreinterpretq_u16_m128i(b)); + int m0 = (la < lb) ? 0 : ((1 << la) - (1 << lb)); + int m1 = 0x100 - (1 << la); + int tb = 0x100 - (1 << lb); + uint16x8_t vec_mask = vld1q_u16(_sse2neon_cmpestr_mask16b); + uint16x8_t vec0 = vtstq_u16(vdupq_n_u16(m0), vec_mask); + uint16x8_t vec1 = vtstq_u16(vdupq_n_u16(m1), vec_mask); + uint16x8_t tmp = vtstq_u16(vdupq_n_u16(tb), vec_mask); + mtx = vbslq_u16(vec0, vdupq_n_u16(0), mtx); + mtx = vbslq_u16(vec1, tmp, mtx); + mtx = vandq_u16(mtx, vec_mask); + return _sse2neon_vaddvq_u16(mtx); +} + +#define SSE2NEON_AGGREGATE_EQUAL_ORDER_IS_UBYTE 1 +#define SSE2NEON_AGGREGATE_EQUAL_ORDER_IS_UWORD 0 + +#define SSE2NEON_AGGREGATE_EQUAL_ORDER_IMPL(size, number_of_lanes, data_type) \ + static int _sse2neon_aggregate_equal_ordered_##size##x##number_of_lanes( \ + int bound, int la, int lb, __m128i mtx[16]) \ + { \ + int res = 0; \ + int m1 = SSE2NEON_IIF(data_type)(0x10000, 0x100) - (1 << la); \ + uint##size##x8_t vec_mask = SSE2NEON_IIF(data_type)( \ + vld1_u##size(_sse2neon_cmpestr_mask##size##b), \ + vld1q_u##size(_sse2neon_cmpestr_mask##size##b)); \ + uint##size##x##number_of_lanes##_t vec1 = SSE2NEON_IIF(data_type)( \ + vcombine_u##size(vtst_u##size(vdup_n_u##size(m1), vec_mask), \ + vtst_u##size(vdup_n_u##size(m1 >> 8), vec_mask)), \ + vtstq_u##size(vdupq_n_u##size(m1), vec_mask)); \ + uint##size##x##number_of_lanes##_t vec_minusone = vdupq_n_u##size(-1); \ + uint##size##x##number_of_lanes##_t vec_zero = vdupq_n_u##size(0); \ + for (int j = 0; j < lb; j++) { \ + mtx[j] = vreinterpretq_m128i_u##size(vbslq_u##size( \ + vec1, vec_minusone, vreinterpretq_u##size##_m128i(mtx[j]))); \ + } \ + for (int j = lb; j < bound; j++) { \ + mtx[j] = vreinterpretq_m128i_u##size( \ + vbslq_u##size(vec1, vec_minusone, vec_zero)); \ + } \ + unsigned SSE2NEON_IIF(data_type)(char, short) *ptr = \ + (unsigned SSE2NEON_IIF(data_type)(char, short) *) mtx; \ + for (int i = 0; i < bound; i++) { \ + int val = 1; \ + for (int j = 0, k = i; j < bound - i && k < bound; j++, k++) \ + val &= ptr[k * bound + j]; \ + res += val << i; \ + } \ + return res; \ + } + +/* clang-format off */ +#define SSE2NEON_GENERATE_AGGREGATE_EQUAL_ORDER(prefix) \ + prefix##IMPL(8, 16, prefix##IS_UBYTE) \ + prefix##IMPL(16, 8, prefix##IS_UWORD) +/* clang-format on */ + +SSE2NEON_GENERATE_AGGREGATE_EQUAL_ORDER(SSE2NEON_AGGREGATE_EQUAL_ORDER_) + +#undef SSE2NEON_AGGREGATE_EQUAL_ORDER_IS_UBYTE +#undef SSE2NEON_AGGREGATE_EQUAL_ORDER_IS_UWORD + +/* clang-format off */ +#define SSE2NEON_GENERATE_CMP_EQUAL_ORDERED(prefix) \ + prefix##IMPL(byte) \ + prefix##IMPL(word) +/* clang-format on */ + +SSE2NEON_GENERATE_CMP_EQUAL_ORDERED(SSE2NEON_CMP_EQUAL_ORDERED_) + +#define SSE2NEON_CMPESTR_LIST \ + _(CMP_UBYTE_EQUAL_ANY, cmp_byte_equal_any) \ + _(CMP_UWORD_EQUAL_ANY, cmp_word_equal_any) \ + _(CMP_SBYTE_EQUAL_ANY, cmp_byte_equal_any) \ + _(CMP_SWORD_EQUAL_ANY, cmp_word_equal_any) \ + _(CMP_UBYTE_RANGES, cmp_ubyte_ranges) \ + _(CMP_UWORD_RANGES, cmp_uword_ranges) \ + _(CMP_SBYTE_RANGES, cmp_sbyte_ranges) \ + _(CMP_SWORD_RANGES, cmp_sword_ranges) \ + _(CMP_UBYTE_EQUAL_EACH, cmp_byte_equal_each) \ + _(CMP_UWORD_EQUAL_EACH, cmp_word_equal_each) \ + _(CMP_SBYTE_EQUAL_EACH, cmp_byte_equal_each) \ + _(CMP_SWORD_EQUAL_EACH, cmp_word_equal_each) \ + _(CMP_UBYTE_EQUAL_ORDERED, cmp_byte_equal_ordered) \ + _(CMP_UWORD_EQUAL_ORDERED, cmp_word_equal_ordered) \ + _(CMP_SBYTE_EQUAL_ORDERED, cmp_byte_equal_ordered) \ + _(CMP_SWORD_EQUAL_ORDERED, cmp_word_equal_ordered) + +enum { +#define _(name, func_suffix) name, + SSE2NEON_CMPESTR_LIST +#undef _ +}; +typedef int (*cmpestr_func_t)(__m128i a, int la, __m128i b, int lb); +static cmpestr_func_t _sse2neon_cmpfunc_table[] = { +#define _(name, func_suffix) _sse2neon_##func_suffix, + SSE2NEON_CMPESTR_LIST +#undef _ +}; + +FORCE_INLINE int _sse2neon_sido_negative(int res, int lb, int imm8, int bound) +{ + switch (imm8 & 0x30) { + case _SIDD_NEGATIVE_POLARITY: + res ^= 0xffffffff; + break; + case _SIDD_MASKED_NEGATIVE_POLARITY: + res ^= (1 << lb) - 1; + break; + default: + break; + } + + return res & ((bound == 8) ? 0xFF : 0xFFFF); +} + +FORCE_INLINE int _sse2neon_clz(unsigned int x) +{ +#ifdef _MSC_VER + unsigned long cnt = 0; + if (_BitScanReverse(&cnt, x)) + return 31 - cnt; + return 32; +#else + return x != 0 ? __builtin_clz(x) : 32; +#endif +} + +FORCE_INLINE int _sse2neon_ctz(unsigned int x) +{ +#ifdef _MSC_VER + unsigned long cnt = 0; + if (_BitScanForward(&cnt, x)) + return cnt; + return 32; +#else + return x != 0 ? __builtin_ctz(x) : 32; +#endif +} + +FORCE_INLINE int _sse2neon_ctzll(unsigned long long x) +{ +#ifdef _MSC_VER + unsigned long cnt; +#if defined(SSE2NEON_HAS_BITSCAN64) + if (_BitScanForward64(&cnt, x)) + return (int) (cnt); +#else + if (_BitScanForward(&cnt, (unsigned long) (x))) + return (int) cnt; + if (_BitScanForward(&cnt, (unsigned long) (x >> 32))) + return (int) (cnt + 32); +#endif /* SSE2NEON_HAS_BITSCAN64 */ + return 64; +#else /* assume GNU compatible compilers */ + return x != 0 ? __builtin_ctzll(x) : 64; +#endif +} + +#define SSE2NEON_MIN(x, y) (x) < (y) ? (x) : (y) + +#define SSE2NEON_CMPSTR_SET_UPPER(var, imm) \ + const int var = (imm & 0x01) ? 8 : 16 + +#define SSE2NEON_CMPESTRX_LEN_PAIR(a, b, la, lb) \ + int tmp1 = la ^ (la >> 31); \ + la = tmp1 - (la >> 31); \ + int tmp2 = lb ^ (lb >> 31); \ + lb = tmp2 - (lb >> 31); \ + la = SSE2NEON_MIN(la, bound); \ + lb = SSE2NEON_MIN(lb, bound) + +// Compare all pairs of character in string a and b, +// then aggregate the result. +// As the only difference of PCMPESTR* and PCMPISTR* is the way to calculate the +// length of string, we use SSE2NEON_CMP{I,E}STRX_GET_LEN to get the length of +// string a and b. +#define SSE2NEON_COMP_AGG(a, b, la, lb, imm8, IE) \ + SSE2NEON_CMPSTR_SET_UPPER(bound, imm8); \ + SSE2NEON_##IE##_LEN_PAIR(a, b, la, lb); \ + int r2 = (_sse2neon_cmpfunc_table[imm8 & 0x0f])(a, la, b, lb); \ + r2 = _sse2neon_sido_negative(r2, lb, imm8, bound) + +#define SSE2NEON_CMPSTR_GENERATE_INDEX(r2, bound, imm8) \ + return (r2 == 0) ? bound \ + : ((imm8 & 0x40) ? (31 - _sse2neon_clz(r2)) \ + : _sse2neon_ctz(r2)) + +#define SSE2NEON_CMPSTR_GENERATE_MASK(dst) \ + __m128i dst = vreinterpretq_m128i_u8(vdupq_n_u8(0)); \ + if (imm8 & 0x40) { \ + if (bound == 8) { \ + uint16x8_t tmp = vtstq_u16(vdupq_n_u16(r2), \ + vld1q_u16(_sse2neon_cmpestr_mask16b)); \ + dst = vreinterpretq_m128i_u16(vbslq_u16( \ + tmp, vdupq_n_u16(-1), vreinterpretq_u16_m128i(dst))); \ + } else { \ + uint8x16_t vec_r2 = \ + vcombine_u8(vdup_n_u8(r2), vdup_n_u8(r2 >> 8)); \ + uint8x16_t tmp = \ + vtstq_u8(vec_r2, vld1q_u8(_sse2neon_cmpestr_mask8b)); \ + dst = vreinterpretq_m128i_u8( \ + vbslq_u8(tmp, vdupq_n_u8(-1), vreinterpretq_u8_m128i(dst))); \ + } \ + } else { \ + if (bound == 16) { \ + dst = vreinterpretq_m128i_u16( \ + vsetq_lane_u16(r2 & 0xffff, vreinterpretq_u16_m128i(dst), 0)); \ + } else { \ + dst = vreinterpretq_m128i_u8( \ + vsetq_lane_u8(r2 & 0xff, vreinterpretq_u8_m128i(dst), 0)); \ + } \ + } \ + return dst + +// Compare packed strings in a and b with lengths la and lb using the control +// in imm8, and returns 1 if b did not contain a null character and the +// resulting mask was zero, and 0 otherwise. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpestra +FORCE_INLINE int _mm_cmpestra(__m128i a, + int la, + __m128i b, + int lb, + const int imm8) +{ + int lb_cpy = lb; + SSE2NEON_COMP_AGG(a, b, la, lb, imm8, CMPESTRX); + return !r2 & (lb_cpy > bound); +} + +// Compare packed strings in a and b with lengths la and lb using the control in +// imm8, and returns 1 if the resulting mask was non-zero, and 0 otherwise. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpestrc +FORCE_INLINE int _mm_cmpestrc(__m128i a, + int la, + __m128i b, + int lb, + const int imm8) +{ + SSE2NEON_COMP_AGG(a, b, la, lb, imm8, CMPESTRX); + return r2 != 0; +} + +// Compare packed strings in a and b with lengths la and lb using the control +// in imm8, and store the generated index in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpestri +FORCE_INLINE int _mm_cmpestri(__m128i a, + int la, + __m128i b, + int lb, + const int imm8) +{ + SSE2NEON_COMP_AGG(a, b, la, lb, imm8, CMPESTRX); + SSE2NEON_CMPSTR_GENERATE_INDEX(r2, bound, imm8); +} + +// Compare packed strings in a and b with lengths la and lb using the control +// in imm8, and store the generated mask in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpestrm +FORCE_INLINE __m128i +_mm_cmpestrm(__m128i a, int la, __m128i b, int lb, const int imm8) +{ + SSE2NEON_COMP_AGG(a, b, la, lb, imm8, CMPESTRX); + SSE2NEON_CMPSTR_GENERATE_MASK(dst); +} + +// Compare packed strings in a and b with lengths la and lb using the control in +// imm8, and returns bit 0 of the resulting bit mask. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpestro +FORCE_INLINE int _mm_cmpestro(__m128i a, + int la, + __m128i b, + int lb, + const int imm8) +{ + SSE2NEON_COMP_AGG(a, b, la, lb, imm8, CMPESTRX); + return r2 & 1; +} + +// Compare packed strings in a and b with lengths la and lb using the control in +// imm8, and returns 1 if any character in a was null, and 0 otherwise. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpestrs +FORCE_INLINE int _mm_cmpestrs(__m128i a, + int la, + __m128i b, + int lb, + const int imm8) +{ + (void) a; + (void) b; + (void) lb; + SSE2NEON_CMPSTR_SET_UPPER(bound, imm8); + return la <= (bound - 1); +} + +// Compare packed strings in a and b with lengths la and lb using the control in +// imm8, and returns 1 if any character in b was null, and 0 otherwise. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpestrz +FORCE_INLINE int _mm_cmpestrz(__m128i a, + int la, + __m128i b, + int lb, + const int imm8) +{ + (void) a; + (void) b; + (void) la; + SSE2NEON_CMPSTR_SET_UPPER(bound, imm8); + return lb <= (bound - 1); +} + +#define SSE2NEON_CMPISTRX_LENGTH(str, len, imm8) \ + do { \ + if (imm8 & 0x01) { \ + uint16x8_t equal_mask_##str = \ + vceqq_u16(vreinterpretq_u16_m128i(str), vdupq_n_u16(0)); \ + uint8x8_t res_##str = vshrn_n_u16(equal_mask_##str, 4); \ + uint64_t matches_##str = \ + vget_lane_u64(vreinterpret_u64_u8(res_##str), 0); \ + len = _sse2neon_ctzll(matches_##str) >> 3; \ + } else { \ + uint16x8_t equal_mask_##str = vreinterpretq_u16_u8( \ + vceqq_u8(vreinterpretq_u8_m128i(str), vdupq_n_u8(0))); \ + uint8x8_t res_##str = vshrn_n_u16(equal_mask_##str, 4); \ + uint64_t matches_##str = \ + vget_lane_u64(vreinterpret_u64_u8(res_##str), 0); \ + len = _sse2neon_ctzll(matches_##str) >> 2; \ + } \ + } while (0) + +#define SSE2NEON_CMPISTRX_LEN_PAIR(a, b, la, lb) \ + int la, lb; \ + do { \ + SSE2NEON_CMPISTRX_LENGTH(a, la, imm8); \ + SSE2NEON_CMPISTRX_LENGTH(b, lb, imm8); \ + } while (0) + +// Compare packed strings with implicit lengths in a and b using the control in +// imm8, and returns 1 if b did not contain a null character and the resulting +// mask was zero, and 0 otherwise. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpistra +FORCE_INLINE int _mm_cmpistra(__m128i a, __m128i b, const int imm8) +{ + SSE2NEON_COMP_AGG(a, b, la, lb, imm8, CMPISTRX); + return !r2 & (lb >= bound); +} + +// Compare packed strings with implicit lengths in a and b using the control in +// imm8, and returns 1 if the resulting mask was non-zero, and 0 otherwise. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpistrc +FORCE_INLINE int _mm_cmpistrc(__m128i a, __m128i b, const int imm8) +{ + SSE2NEON_COMP_AGG(a, b, la, lb, imm8, CMPISTRX); + return r2 != 0; +} + +// Compare packed strings with implicit lengths in a and b using the control in +// imm8, and store the generated index in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpistri +FORCE_INLINE int _mm_cmpistri(__m128i a, __m128i b, const int imm8) +{ + SSE2NEON_COMP_AGG(a, b, la, lb, imm8, CMPISTRX); + SSE2NEON_CMPSTR_GENERATE_INDEX(r2, bound, imm8); +} + +// Compare packed strings with implicit lengths in a and b using the control in +// imm8, and store the generated mask in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpistrm +FORCE_INLINE __m128i _mm_cmpistrm(__m128i a, __m128i b, const int imm8) +{ + SSE2NEON_COMP_AGG(a, b, la, lb, imm8, CMPISTRX); + SSE2NEON_CMPSTR_GENERATE_MASK(dst); +} + +// Compare packed strings with implicit lengths in a and b using the control in +// imm8, and returns bit 0 of the resulting bit mask. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpistro +FORCE_INLINE int _mm_cmpistro(__m128i a, __m128i b, const int imm8) +{ + SSE2NEON_COMP_AGG(a, b, la, lb, imm8, CMPISTRX); + return r2 & 1; +} + +// Compare packed strings with implicit lengths in a and b using the control in +// imm8, and returns 1 if any character in a was null, and 0 otherwise. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpistrs +FORCE_INLINE int _mm_cmpistrs(__m128i a, __m128i b, const int imm8) +{ + (void) b; + SSE2NEON_CMPSTR_SET_UPPER(bound, imm8); + int la; + SSE2NEON_CMPISTRX_LENGTH(a, la, imm8); + return la <= (bound - 1); +} + +// Compare packed strings with implicit lengths in a and b using the control in +// imm8, and returns 1 if any character in b was null, and 0 otherwise. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpistrz +FORCE_INLINE int _mm_cmpistrz(__m128i a, __m128i b, const int imm8) +{ + (void) a; + SSE2NEON_CMPSTR_SET_UPPER(bound, imm8); + int lb; + SSE2NEON_CMPISTRX_LENGTH(b, lb, imm8); + return lb <= (bound - 1); +} + +// Compares the 2 signed 64-bit integers in a and the 2 signed 64-bit integers +// in b for greater than. +FORCE_INLINE __m128i _mm_cmpgt_epi64(__m128i a, __m128i b) +{ +#if defined(__aarch64__) || defined(_M_ARM64) + return vreinterpretq_m128i_u64( + vcgtq_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(b))); +#else + return vreinterpretq_m128i_s64(vshrq_n_s64( + vqsubq_s64(vreinterpretq_s64_m128i(b), vreinterpretq_s64_m128i(a)), + 63)); +#endif +} + +// Starting with the initial value in crc, accumulates a CRC32 value for +// unsigned 16-bit integer v, and stores the result in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_crc32_u16 +FORCE_INLINE uint32_t _mm_crc32_u16(uint32_t crc, uint16_t v) +{ +#if defined(__aarch64__) && defined(__ARM_FEATURE_CRC32) + __asm__ __volatile__("crc32ch %w[c], %w[c], %w[v]\n\t" + : [c] "+r"(crc) + : [v] "r"(v)); +#elif ((__ARM_ARCH == 8) && defined(__ARM_FEATURE_CRC32)) || \ + (defined(_M_ARM64) && !defined(__clang__)) + crc = __crc32ch(crc, v); +#else + crc = _mm_crc32_u8(crc, v & 0xff); + crc = _mm_crc32_u8(crc, (v >> 8) & 0xff); +#endif + return crc; +} + +// Starting with the initial value in crc, accumulates a CRC32 value for +// unsigned 32-bit integer v, and stores the result in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_crc32_u32 +FORCE_INLINE uint32_t _mm_crc32_u32(uint32_t crc, uint32_t v) +{ +#if defined(__aarch64__) && defined(__ARM_FEATURE_CRC32) + __asm__ __volatile__("crc32cw %w[c], %w[c], %w[v]\n\t" + : [c] "+r"(crc) + : [v] "r"(v)); +#elif ((__ARM_ARCH == 8) && defined(__ARM_FEATURE_CRC32)) || \ + (defined(_M_ARM64) && !defined(__clang__)) + crc = __crc32cw(crc, v); +#else + crc = _mm_crc32_u16(crc, v & 0xffff); + crc = _mm_crc32_u16(crc, (v >> 16) & 0xffff); +#endif + return crc; +} + +// Starting with the initial value in crc, accumulates a CRC32 value for +// unsigned 64-bit integer v, and stores the result in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_crc32_u64 +FORCE_INLINE uint64_t _mm_crc32_u64(uint64_t crc, uint64_t v) +{ +#if defined(__aarch64__) && defined(__ARM_FEATURE_CRC32) + __asm__ __volatile__("crc32cx %w[c], %w[c], %x[v]\n\t" + : [c] "+r"(crc) + : [v] "r"(v)); +#elif (defined(_M_ARM64) && !defined(__clang__)) + crc = __crc32cd((uint32_t) crc, v); +#else + crc = _mm_crc32_u32((uint32_t) (crc), v & 0xffffffff); + crc = _mm_crc32_u32((uint32_t) (crc), (v >> 32) & 0xffffffff); +#endif + return crc; +} + +// Starting with the initial value in crc, accumulates a CRC32 value for +// unsigned 8-bit integer v, and stores the result in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_crc32_u8 +FORCE_INLINE uint32_t _mm_crc32_u8(uint32_t crc, uint8_t v) +{ +#if defined(__aarch64__) && defined(__ARM_FEATURE_CRC32) + __asm__ __volatile__("crc32cb %w[c], %w[c], %w[v]\n\t" + : [c] "+r"(crc) + : [v] "r"(v)); +#elif ((__ARM_ARCH == 8) && defined(__ARM_FEATURE_CRC32)) || \ + (defined(_M_ARM64) && !defined(__clang__)) + crc = __crc32cb(crc, v); +#else + crc ^= v; +#if defined(__ARM_FEATURE_CRYPTO) + // Adapted from: https://mary.rs/lab/crc32/ + // Barrent reduction + uint64x2_t orig = + vcombine_u64(vcreate_u64((uint64_t) (crc) << 24), vcreate_u64(0x0)); + uint64x2_t tmp = orig; + + // Polynomial P(x) of CRC32C + uint64_t p = 0x105EC76F1; + // Barrett Reduction (in bit-reflected form) constant mu_{64} = \lfloor + // 2^{64} / P(x) \rfloor = 0x11f91caf6 + uint64_t mu = 0x1dea713f1; + + // Multiply by mu_{64} + tmp = _sse2neon_vmull_p64(vget_low_u64(tmp), vcreate_u64(mu)); + // Divide by 2^{64} (mask away the unnecessary bits) + tmp = + vandq_u64(tmp, vcombine_u64(vcreate_u64(0xFFFFFFFF), vcreate_u64(0x0))); + // Multiply by P(x) (shifted left by 1 for alignment reasons) + tmp = _sse2neon_vmull_p64(vget_low_u64(tmp), vcreate_u64(p)); + // Subtract original from result + tmp = veorq_u64(tmp, orig); + + // Extract the 'lower' (in bit-reflected sense) 32 bits + crc = vgetq_lane_u32(vreinterpretq_u32_u64(tmp), 1); +#else // Fall back to the generic table lookup approach + // Adapted from: https://create.stephan-brumme.com/crc32/ + // Apply half-byte comparision algorithm for the best ratio between + // performance and lookup table. + + // The lookup table just needs to store every 16th entry + // of the standard look-up table. + static const uint32_t crc32_half_byte_tbl[] = { + 0x00000000, 0x105ec76f, 0x20bd8ede, 0x30e349b1, 0x417b1dbc, 0x5125dad3, + 0x61c69362, 0x7198540d, 0x82f63b78, 0x92a8fc17, 0xa24bb5a6, 0xb21572c9, + 0xc38d26c4, 0xd3d3e1ab, 0xe330a81a, 0xf36e6f75, + }; + + crc = (crc >> 4) ^ crc32_half_byte_tbl[crc & 0x0F]; + crc = (crc >> 4) ^ crc32_half_byte_tbl[crc & 0x0F]; +#endif +#endif + return crc; +} + +/* AES */ + +#if !defined(__ARM_FEATURE_CRYPTO) && (!defined(_M_ARM64) || defined(__clang__)) +/* clang-format off */ +#define SSE2NEON_AES_SBOX(w) \ + { \ + w(0x63), w(0x7c), w(0x77), w(0x7b), w(0xf2), w(0x6b), w(0x6f), \ + w(0xc5), w(0x30), w(0x01), w(0x67), w(0x2b), w(0xfe), w(0xd7), \ + w(0xab), w(0x76), w(0xca), w(0x82), w(0xc9), w(0x7d), w(0xfa), \ + w(0x59), w(0x47), w(0xf0), w(0xad), w(0xd4), w(0xa2), w(0xaf), \ + w(0x9c), w(0xa4), w(0x72), w(0xc0), w(0xb7), w(0xfd), w(0x93), \ + w(0x26), w(0x36), w(0x3f), w(0xf7), w(0xcc), w(0x34), w(0xa5), \ + w(0xe5), w(0xf1), w(0x71), w(0xd8), w(0x31), w(0x15), w(0x04), \ + w(0xc7), w(0x23), w(0xc3), w(0x18), w(0x96), w(0x05), w(0x9a), \ + w(0x07), w(0x12), w(0x80), w(0xe2), w(0xeb), w(0x27), w(0xb2), \ + w(0x75), w(0x09), w(0x83), w(0x2c), w(0x1a), w(0x1b), w(0x6e), \ + w(0x5a), w(0xa0), w(0x52), w(0x3b), w(0xd6), w(0xb3), w(0x29), \ + w(0xe3), w(0x2f), w(0x84), w(0x53), w(0xd1), w(0x00), w(0xed), \ + w(0x20), w(0xfc), w(0xb1), w(0x5b), w(0x6a), w(0xcb), w(0xbe), \ + w(0x39), w(0x4a), w(0x4c), w(0x58), w(0xcf), w(0xd0), w(0xef), \ + w(0xaa), w(0xfb), w(0x43), w(0x4d), w(0x33), w(0x85), w(0x45), \ + w(0xf9), w(0x02), w(0x7f), w(0x50), w(0x3c), w(0x9f), w(0xa8), \ + w(0x51), w(0xa3), w(0x40), w(0x8f), w(0x92), w(0x9d), w(0x38), \ + w(0xf5), w(0xbc), w(0xb6), w(0xda), w(0x21), w(0x10), w(0xff), \ + w(0xf3), w(0xd2), w(0xcd), w(0x0c), w(0x13), w(0xec), w(0x5f), \ + w(0x97), w(0x44), w(0x17), w(0xc4), w(0xa7), w(0x7e), w(0x3d), \ + w(0x64), w(0x5d), w(0x19), w(0x73), w(0x60), w(0x81), w(0x4f), \ + w(0xdc), w(0x22), w(0x2a), w(0x90), w(0x88), w(0x46), w(0xee), \ + w(0xb8), w(0x14), w(0xde), w(0x5e), w(0x0b), w(0xdb), w(0xe0), \ + w(0x32), w(0x3a), w(0x0a), w(0x49), w(0x06), w(0x24), w(0x5c), \ + w(0xc2), w(0xd3), w(0xac), w(0x62), w(0x91), w(0x95), w(0xe4), \ + w(0x79), w(0xe7), w(0xc8), w(0x37), w(0x6d), w(0x8d), w(0xd5), \ + w(0x4e), w(0xa9), w(0x6c), w(0x56), w(0xf4), w(0xea), w(0x65), \ + w(0x7a), w(0xae), w(0x08), w(0xba), w(0x78), w(0x25), w(0x2e), \ + w(0x1c), w(0xa6), w(0xb4), w(0xc6), w(0xe8), w(0xdd), w(0x74), \ + w(0x1f), w(0x4b), w(0xbd), w(0x8b), w(0x8a), w(0x70), w(0x3e), \ + w(0xb5), w(0x66), w(0x48), w(0x03), w(0xf6), w(0x0e), w(0x61), \ + w(0x35), w(0x57), w(0xb9), w(0x86), w(0xc1), w(0x1d), w(0x9e), \ + w(0xe1), w(0xf8), w(0x98), w(0x11), w(0x69), w(0xd9), w(0x8e), \ + w(0x94), w(0x9b), w(0x1e), w(0x87), w(0xe9), w(0xce), w(0x55), \ + w(0x28), w(0xdf), w(0x8c), w(0xa1), w(0x89), w(0x0d), w(0xbf), \ + w(0xe6), w(0x42), w(0x68), w(0x41), w(0x99), w(0x2d), w(0x0f), \ + w(0xb0), w(0x54), w(0xbb), w(0x16) \ + } +#define SSE2NEON_AES_RSBOX(w) \ + { \ + w(0x52), w(0x09), w(0x6a), w(0xd5), w(0x30), w(0x36), w(0xa5), \ + w(0x38), w(0xbf), w(0x40), w(0xa3), w(0x9e), w(0x81), w(0xf3), \ + w(0xd7), w(0xfb), w(0x7c), w(0xe3), w(0x39), w(0x82), w(0x9b), \ + w(0x2f), w(0xff), w(0x87), w(0x34), w(0x8e), w(0x43), w(0x44), \ + w(0xc4), w(0xde), w(0xe9), w(0xcb), w(0x54), w(0x7b), w(0x94), \ + w(0x32), w(0xa6), w(0xc2), w(0x23), w(0x3d), w(0xee), w(0x4c), \ + w(0x95), w(0x0b), w(0x42), w(0xfa), w(0xc3), w(0x4e), w(0x08), \ + w(0x2e), w(0xa1), w(0x66), w(0x28), w(0xd9), w(0x24), w(0xb2), \ + w(0x76), w(0x5b), w(0xa2), w(0x49), w(0x6d), w(0x8b), w(0xd1), \ + w(0x25), w(0x72), w(0xf8), w(0xf6), w(0x64), w(0x86), w(0x68), \ + w(0x98), w(0x16), w(0xd4), w(0xa4), w(0x5c), w(0xcc), w(0x5d), \ + w(0x65), w(0xb6), w(0x92), w(0x6c), w(0x70), w(0x48), w(0x50), \ + w(0xfd), w(0xed), w(0xb9), w(0xda), w(0x5e), w(0x15), w(0x46), \ + w(0x57), w(0xa7), w(0x8d), w(0x9d), w(0x84), w(0x90), w(0xd8), \ + w(0xab), w(0x00), w(0x8c), w(0xbc), w(0xd3), w(0x0a), w(0xf7), \ + w(0xe4), w(0x58), w(0x05), w(0xb8), w(0xb3), w(0x45), w(0x06), \ + w(0xd0), w(0x2c), w(0x1e), w(0x8f), w(0xca), w(0x3f), w(0x0f), \ + w(0x02), w(0xc1), w(0xaf), w(0xbd), w(0x03), w(0x01), w(0x13), \ + w(0x8a), w(0x6b), w(0x3a), w(0x91), w(0x11), w(0x41), w(0x4f), \ + w(0x67), w(0xdc), w(0xea), w(0x97), w(0xf2), w(0xcf), w(0xce), \ + w(0xf0), w(0xb4), w(0xe6), w(0x73), w(0x96), w(0xac), w(0x74), \ + w(0x22), w(0xe7), w(0xad), w(0x35), w(0x85), w(0xe2), w(0xf9), \ + w(0x37), w(0xe8), w(0x1c), w(0x75), w(0xdf), w(0x6e), w(0x47), \ + w(0xf1), w(0x1a), w(0x71), w(0x1d), w(0x29), w(0xc5), w(0x89), \ + w(0x6f), w(0xb7), w(0x62), w(0x0e), w(0xaa), w(0x18), w(0xbe), \ + w(0x1b), w(0xfc), w(0x56), w(0x3e), w(0x4b), w(0xc6), w(0xd2), \ + w(0x79), w(0x20), w(0x9a), w(0xdb), w(0xc0), w(0xfe), w(0x78), \ + w(0xcd), w(0x5a), w(0xf4), w(0x1f), w(0xdd), w(0xa8), w(0x33), \ + w(0x88), w(0x07), w(0xc7), w(0x31), w(0xb1), w(0x12), w(0x10), \ + w(0x59), w(0x27), w(0x80), w(0xec), w(0x5f), w(0x60), w(0x51), \ + w(0x7f), w(0xa9), w(0x19), w(0xb5), w(0x4a), w(0x0d), w(0x2d), \ + w(0xe5), w(0x7a), w(0x9f), w(0x93), w(0xc9), w(0x9c), w(0xef), \ + w(0xa0), w(0xe0), w(0x3b), w(0x4d), w(0xae), w(0x2a), w(0xf5), \ + w(0xb0), w(0xc8), w(0xeb), w(0xbb), w(0x3c), w(0x83), w(0x53), \ + w(0x99), w(0x61), w(0x17), w(0x2b), w(0x04), w(0x7e), w(0xba), \ + w(0x77), w(0xd6), w(0x26), w(0xe1), w(0x69), w(0x14), w(0x63), \ + w(0x55), w(0x21), w(0x0c), w(0x7d) \ + } +/* clang-format on */ + +/* X Macro trick. See https://en.wikipedia.org/wiki/X_Macro */ +#define SSE2NEON_AES_H0(x) (x) +static const uint8_t _sse2neon_sbox[256] = SSE2NEON_AES_SBOX(SSE2NEON_AES_H0); +static const uint8_t _sse2neon_rsbox[256] = SSE2NEON_AES_RSBOX(SSE2NEON_AES_H0); +#undef SSE2NEON_AES_H0 + +/* x_time function and matrix multiply function */ +#if !defined(__aarch64__) && !defined(_M_ARM64) +#define SSE2NEON_XT(x) (((x) << 1) ^ ((((x) >> 7) & 1) * 0x1b)) +#define SSE2NEON_MULTIPLY(x, y) \ + (((y & 1) * x) ^ ((y >> 1 & 1) * SSE2NEON_XT(x)) ^ \ + ((y >> 2 & 1) * SSE2NEON_XT(SSE2NEON_XT(x))) ^ \ + ((y >> 3 & 1) * SSE2NEON_XT(SSE2NEON_XT(SSE2NEON_XT(x)))) ^ \ + ((y >> 4 & 1) * SSE2NEON_XT(SSE2NEON_XT(SSE2NEON_XT(SSE2NEON_XT(x)))))) +#endif + +// In the absence of crypto extensions, implement aesenc using regular NEON +// intrinsics instead. See: +// https://www.workofard.com/2017/01/accelerated-aes-for-the-arm64-linux-kernel/ +// https://www.workofard.com/2017/07/ghash-for-low-end-cores/ and +// for more information. +FORCE_INLINE __m128i _mm_aesenc_si128(__m128i a, __m128i RoundKey) +{ +#if defined(__aarch64__) || defined(_M_ARM64) + static const uint8_t shift_rows[] = { + 0x0, 0x5, 0xa, 0xf, 0x4, 0x9, 0xe, 0x3, + 0x8, 0xd, 0x2, 0x7, 0xc, 0x1, 0x6, 0xb, + }; + static const uint8_t ror32by8[] = { + 0x1, 0x2, 0x3, 0x0, 0x5, 0x6, 0x7, 0x4, + 0x9, 0xa, 0xb, 0x8, 0xd, 0xe, 0xf, 0xc, + }; + + uint8x16_t v; + uint8x16_t w = vreinterpretq_u8_m128i(a); + + /* shift rows */ + w = vqtbl1q_u8(w, vld1q_u8(shift_rows)); + + /* sub bytes */ + // Here, we separate the whole 256-bytes table into 4 64-bytes tables, and + // look up each of the table. After each lookup, we load the next table + // which locates at the next 64-bytes. In the meantime, the index in the + // table would be smaller than it was, so the index parameters of + // `vqtbx4q_u8()` need to be added the same constant as the loaded tables. + v = vqtbl4q_u8(_sse2neon_vld1q_u8_x4(_sse2neon_sbox), w); + // 'w-0x40' equals to 'vsubq_u8(w, vdupq_n_u8(0x40))' + v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_sbox + 0x40), w - 0x40); + v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_sbox + 0x80), w - 0x80); + v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_sbox + 0xc0), w - 0xc0); + + /* mix columns */ + w = (v << 1) ^ (uint8x16_t) (((int8x16_t) v >> 7) & 0x1b); + w ^= (uint8x16_t) vrev32q_u16((uint16x8_t) v); + w ^= vqtbl1q_u8(v ^ w, vld1q_u8(ror32by8)); + + /* add round key */ + return vreinterpretq_m128i_u8(w) ^ RoundKey; + +#else /* ARMv7-A implementation for a table-based AES */ +#define SSE2NEON_AES_B2W(b0, b1, b2, b3) \ + (((uint32_t) (b3) << 24) | ((uint32_t) (b2) << 16) | \ + ((uint32_t) (b1) << 8) | (uint32_t) (b0)) +// muliplying 'x' by 2 in GF(2^8) +#define SSE2NEON_AES_F2(x) ((x << 1) ^ (((x >> 7) & 1) * 0x011b /* WPOLY */)) +// muliplying 'x' by 3 in GF(2^8) +#define SSE2NEON_AES_F3(x) (SSE2NEON_AES_F2(x) ^ x) +#define SSE2NEON_AES_U0(p) \ + SSE2NEON_AES_B2W(SSE2NEON_AES_F2(p), p, p, SSE2NEON_AES_F3(p)) +#define SSE2NEON_AES_U1(p) \ + SSE2NEON_AES_B2W(SSE2NEON_AES_F3(p), SSE2NEON_AES_F2(p), p, p) +#define SSE2NEON_AES_U2(p) \ + SSE2NEON_AES_B2W(p, SSE2NEON_AES_F3(p), SSE2NEON_AES_F2(p), p) +#define SSE2NEON_AES_U3(p) \ + SSE2NEON_AES_B2W(p, p, SSE2NEON_AES_F3(p), SSE2NEON_AES_F2(p)) + + // this generates a table containing every possible permutation of + // shift_rows() and sub_bytes() with mix_columns(). + static const uint32_t ALIGN_STRUCT(16) aes_table[4][256] = { + SSE2NEON_AES_SBOX(SSE2NEON_AES_U0), + SSE2NEON_AES_SBOX(SSE2NEON_AES_U1), + SSE2NEON_AES_SBOX(SSE2NEON_AES_U2), + SSE2NEON_AES_SBOX(SSE2NEON_AES_U3), + }; +#undef SSE2NEON_AES_B2W +#undef SSE2NEON_AES_F2 +#undef SSE2NEON_AES_F3 +#undef SSE2NEON_AES_U0 +#undef SSE2NEON_AES_U1 +#undef SSE2NEON_AES_U2 +#undef SSE2NEON_AES_U3 + + uint32_t x0 = _mm_cvtsi128_si32(a); // get a[31:0] + uint32_t x1 = + _mm_cvtsi128_si32(_mm_shuffle_epi32(a, 0x55)); // get a[63:32] + uint32_t x2 = + _mm_cvtsi128_si32(_mm_shuffle_epi32(a, 0xAA)); // get a[95:64] + uint32_t x3 = + _mm_cvtsi128_si32(_mm_shuffle_epi32(a, 0xFF)); // get a[127:96] + + // finish the modulo addition step in mix_columns() + __m128i out = _mm_set_epi32( + (aes_table[0][x3 & 0xff] ^ aes_table[1][(x0 >> 8) & 0xff] ^ + aes_table[2][(x1 >> 16) & 0xff] ^ aes_table[3][x2 >> 24]), + (aes_table[0][x2 & 0xff] ^ aes_table[1][(x3 >> 8) & 0xff] ^ + aes_table[2][(x0 >> 16) & 0xff] ^ aes_table[3][x1 >> 24]), + (aes_table[0][x1 & 0xff] ^ aes_table[1][(x2 >> 8) & 0xff] ^ + aes_table[2][(x3 >> 16) & 0xff] ^ aes_table[3][x0 >> 24]), + (aes_table[0][x0 & 0xff] ^ aes_table[1][(x1 >> 8) & 0xff] ^ + aes_table[2][(x2 >> 16) & 0xff] ^ aes_table[3][x3 >> 24])); + + return _mm_xor_si128(out, RoundKey); +#endif +} + +// Perform one round of an AES decryption flow on data (state) in a using the +// round key in RoundKey, and store the result in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_aesdec_si128 +FORCE_INLINE __m128i _mm_aesdec_si128(__m128i a, __m128i RoundKey) +{ +#if defined(__aarch64__) + static const uint8_t inv_shift_rows[] = { + 0x0, 0xd, 0xa, 0x7, 0x4, 0x1, 0xe, 0xb, + 0x8, 0x5, 0x2, 0xf, 0xc, 0x9, 0x6, 0x3, + }; + static const uint8_t ror32by8[] = { + 0x1, 0x2, 0x3, 0x0, 0x5, 0x6, 0x7, 0x4, + 0x9, 0xa, 0xb, 0x8, 0xd, 0xe, 0xf, 0xc, + }; + + uint8x16_t v; + uint8x16_t w = vreinterpretq_u8_m128i(a); + + // inverse shift rows + w = vqtbl1q_u8(w, vld1q_u8(inv_shift_rows)); + + // inverse sub bytes + v = vqtbl4q_u8(_sse2neon_vld1q_u8_x4(_sse2neon_rsbox), w); + v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_rsbox + 0x40), w - 0x40); + v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_rsbox + 0x80), w - 0x80); + v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_rsbox + 0xc0), w - 0xc0); + + // inverse mix columns + // multiplying 'v' by 4 in GF(2^8) + w = (v << 1) ^ (uint8x16_t) (((int8x16_t) v >> 7) & 0x1b); + w = (w << 1) ^ (uint8x16_t) (((int8x16_t) w >> 7) & 0x1b); + v ^= w; + v ^= (uint8x16_t) vrev32q_u16((uint16x8_t) w); + + w = (v << 1) ^ (uint8x16_t) (((int8x16_t) v >> 7) & + 0x1b); // muliplying 'v' by 2 in GF(2^8) + w ^= (uint8x16_t) vrev32q_u16((uint16x8_t) v); + w ^= vqtbl1q_u8(v ^ w, vld1q_u8(ror32by8)); + + // add round key + return vreinterpretq_m128i_u8(w) ^ RoundKey; + +#else /* ARMv7-A NEON implementation */ + /* FIXME: optimized for NEON */ + uint8_t i, e, f, g, h, v[4][4]; + uint8_t *_a = (uint8_t *) &a; + for (i = 0; i < 16; ++i) { + v[((i / 4) + (i % 4)) % 4][i % 4] = _sse2neon_rsbox[_a[i]]; + } + + // inverse mix columns + for (i = 0; i < 4; ++i) { + e = v[i][0]; + f = v[i][1]; + g = v[i][2]; + h = v[i][3]; + + v[i][0] = SSE2NEON_MULTIPLY(e, 0x0e) ^ SSE2NEON_MULTIPLY(f, 0x0b) ^ + SSE2NEON_MULTIPLY(g, 0x0d) ^ SSE2NEON_MULTIPLY(h, 0x09); + v[i][1] = SSE2NEON_MULTIPLY(e, 0x09) ^ SSE2NEON_MULTIPLY(f, 0x0e) ^ + SSE2NEON_MULTIPLY(g, 0x0b) ^ SSE2NEON_MULTIPLY(h, 0x0d); + v[i][2] = SSE2NEON_MULTIPLY(e, 0x0d) ^ SSE2NEON_MULTIPLY(f, 0x09) ^ + SSE2NEON_MULTIPLY(g, 0x0e) ^ SSE2NEON_MULTIPLY(h, 0x0b); + v[i][3] = SSE2NEON_MULTIPLY(e, 0x0b) ^ SSE2NEON_MULTIPLY(f, 0x0d) ^ + SSE2NEON_MULTIPLY(g, 0x09) ^ SSE2NEON_MULTIPLY(h, 0x0e); + } + + return vreinterpretq_m128i_u8(vld1q_u8((uint8_t *) v)) ^ RoundKey; +#endif +} + +// Perform the last round of an AES encryption flow on data (state) in a using +// the round key in RoundKey, and store the result in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_aesenclast_si128 +FORCE_INLINE __m128i _mm_aesenclast_si128(__m128i a, __m128i RoundKey) +{ +#if defined(__aarch64__) + static const uint8_t shift_rows[] = { + 0x0, 0x5, 0xa, 0xf, 0x4, 0x9, 0xe, 0x3, + 0x8, 0xd, 0x2, 0x7, 0xc, 0x1, 0x6, 0xb, + }; + + uint8x16_t v; + uint8x16_t w = vreinterpretq_u8_m128i(a); + + // shift rows + w = vqtbl1q_u8(w, vld1q_u8(shift_rows)); + + // sub bytes + v = vqtbl4q_u8(_sse2neon_vld1q_u8_x4(_sse2neon_sbox), w); + v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_sbox + 0x40), w - 0x40); + v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_sbox + 0x80), w - 0x80); + v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_sbox + 0xc0), w - 0xc0); + + // add round key + return vreinterpretq_m128i_u8(v) ^ RoundKey; + +#else /* ARMv7-A implementation */ + uint8_t v[16] = { + _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 0)], + _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 5)], + _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 10)], + _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 15)], + _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 4)], + _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 9)], + _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 14)], + _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 3)], + _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 8)], + _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 13)], + _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 2)], + _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 7)], + _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 12)], + _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 1)], + _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 6)], + _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 11)], + }; + + return vreinterpretq_m128i_u8(vld1q_u8(v)) ^ RoundKey; +#endif +} + +// Perform the last round of an AES decryption flow on data (state) in a using +// the round key in RoundKey, and store the result in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_aesdeclast_si128 +FORCE_INLINE __m128i _mm_aesdeclast_si128(__m128i a, __m128i RoundKey) +{ +#if defined(__aarch64__) + static const uint8_t inv_shift_rows[] = { + 0x0, 0xd, 0xa, 0x7, 0x4, 0x1, 0xe, 0xb, + 0x8, 0x5, 0x2, 0xf, 0xc, 0x9, 0x6, 0x3, + }; + + uint8x16_t v; + uint8x16_t w = vreinterpretq_u8_m128i(a); + + // inverse shift rows + w = vqtbl1q_u8(w, vld1q_u8(inv_shift_rows)); + + // inverse sub bytes + v = vqtbl4q_u8(_sse2neon_vld1q_u8_x4(_sse2neon_rsbox), w); + v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_rsbox + 0x40), w - 0x40); + v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_rsbox + 0x80), w - 0x80); + v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_rsbox + 0xc0), w - 0xc0); + + // add round key + return vreinterpretq_m128i_u8(v) ^ RoundKey; + +#else /* ARMv7-A NEON implementation */ + /* FIXME: optimized for NEON */ + uint8_t v[4][4]; + uint8_t *_a = (uint8_t *) &a; + for (int i = 0; i < 16; ++i) { + v[((i / 4) + (i % 4)) % 4][i % 4] = _sse2neon_rsbox[_a[i]]; + } + + return vreinterpretq_m128i_u8(vld1q_u8((uint8_t *) v)) ^ RoundKey; +#endif +} + +// Perform the InvMixColumns transformation on a and store the result in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_aesimc_si128 +FORCE_INLINE __m128i _mm_aesimc_si128(__m128i a) +{ +#if defined(__aarch64__) + static const uint8_t ror32by8[] = { + 0x1, 0x2, 0x3, 0x0, 0x5, 0x6, 0x7, 0x4, + 0x9, 0xa, 0xb, 0x8, 0xd, 0xe, 0xf, 0xc, + }; + uint8x16_t v = vreinterpretq_u8_m128i(a); + uint8x16_t w; + + // multiplying 'v' by 4 in GF(2^8) + w = (v << 1) ^ (uint8x16_t) (((int8x16_t) v >> 7) & 0x1b); + w = (w << 1) ^ (uint8x16_t) (((int8x16_t) w >> 7) & 0x1b); + v ^= w; + v ^= (uint8x16_t) vrev32q_u16((uint16x8_t) w); + + // multiplying 'v' by 2 in GF(2^8) + w = (v << 1) ^ (uint8x16_t) (((int8x16_t) v >> 7) & 0x1b); + w ^= (uint8x16_t) vrev32q_u16((uint16x8_t) v); + w ^= vqtbl1q_u8(v ^ w, vld1q_u8(ror32by8)); + return vreinterpretq_m128i_u8(w); + +#else /* ARMv7-A NEON implementation */ + uint8_t i, e, f, g, h, v[4][4]; + vst1q_u8((uint8_t *) v, vreinterpretq_u8_m128i(a)); + for (i = 0; i < 4; ++i) { + e = v[i][0]; + f = v[i][1]; + g = v[i][2]; + h = v[i][3]; + + v[i][0] = SSE2NEON_MULTIPLY(e, 0x0e) ^ SSE2NEON_MULTIPLY(f, 0x0b) ^ + SSE2NEON_MULTIPLY(g, 0x0d) ^ SSE2NEON_MULTIPLY(h, 0x09); + v[i][1] = SSE2NEON_MULTIPLY(e, 0x09) ^ SSE2NEON_MULTIPLY(f, 0x0e) ^ + SSE2NEON_MULTIPLY(g, 0x0b) ^ SSE2NEON_MULTIPLY(h, 0x0d); + v[i][2] = SSE2NEON_MULTIPLY(e, 0x0d) ^ SSE2NEON_MULTIPLY(f, 0x09) ^ + SSE2NEON_MULTIPLY(g, 0x0e) ^ SSE2NEON_MULTIPLY(h, 0x0b); + v[i][3] = SSE2NEON_MULTIPLY(e, 0x0b) ^ SSE2NEON_MULTIPLY(f, 0x0d) ^ + SSE2NEON_MULTIPLY(g, 0x09) ^ SSE2NEON_MULTIPLY(h, 0x0e); + } + + return vreinterpretq_m128i_u8(vld1q_u8((uint8_t *) v)); +#endif +} + +// Assist in expanding the AES cipher key by computing steps towards generating +// a round key for encryption cipher using data from a and an 8-bit round +// constant specified in imm8, and store the result in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_aeskeygenassist_si128 +// +// Emits the Advanced Encryption Standard (AES) instruction aeskeygenassist. +// This instruction generates a round key for AES encryption. See +// https://kazakov.life/2017/11/01/cryptocurrency-mining-on-ios-devices/ +// for details. +FORCE_INLINE __m128i _mm_aeskeygenassist_si128(__m128i a, const int rcon) +{ +#if defined(__aarch64__) + uint8x16_t _a = vreinterpretq_u8_m128i(a); + uint8x16_t v = vqtbl4q_u8(_sse2neon_vld1q_u8_x4(_sse2neon_sbox), _a); + v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_sbox + 0x40), _a - 0x40); + v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_sbox + 0x80), _a - 0x80); + v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_sbox + 0xc0), _a - 0xc0); + + uint32x4_t v_u32 = vreinterpretq_u32_u8(v); + uint32x4_t ror_v = vorrq_u32(vshrq_n_u32(v_u32, 8), vshlq_n_u32(v_u32, 24)); + uint32x4_t ror_xor_v = veorq_u32(ror_v, vdupq_n_u32(rcon)); + + return vreinterpretq_m128i_u32(vtrn2q_u32(v_u32, ror_xor_v)); + +#else /* ARMv7-A NEON implementation */ + uint32_t X1 = _mm_cvtsi128_si32(_mm_shuffle_epi32(a, 0x55)); + uint32_t X3 = _mm_cvtsi128_si32(_mm_shuffle_epi32(a, 0xFF)); + for (int i = 0; i < 4; ++i) { + ((uint8_t *) &X1)[i] = _sse2neon_sbox[((uint8_t *) &X1)[i]]; + ((uint8_t *) &X3)[i] = _sse2neon_sbox[((uint8_t *) &X3)[i]]; + } + return _mm_set_epi32(((X3 >> 8) | (X3 << 24)) ^ rcon, X3, + ((X1 >> 8) | (X1 << 24)) ^ rcon, X1); +#endif +} +#undef SSE2NEON_AES_SBOX +#undef SSE2NEON_AES_RSBOX + +#if defined(__aarch64__) +#undef SSE2NEON_XT +#undef SSE2NEON_MULTIPLY +#endif + +#else /* __ARM_FEATURE_CRYPTO */ +// Implements equivalent of 'aesenc' by combining AESE (with an empty key) and +// AESMC and then manually applying the real key as an xor operation. This +// unfortunately means an additional xor op; the compiler should be able to +// optimize this away for repeated calls however. See +// https://blog.michaelbrase.com/2018/05/08/emulating-x86-aes-intrinsics-on-armv8-a +// for more details. +FORCE_INLINE __m128i _mm_aesenc_si128(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_u8(veorq_u8( + vaesmcq_u8(vaeseq_u8(vreinterpretq_u8_m128i(a), vdupq_n_u8(0))), + vreinterpretq_u8_m128i(b))); +} + +// Perform one round of an AES decryption flow on data (state) in a using the +// round key in RoundKey, and store the result in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_aesdec_si128 +FORCE_INLINE __m128i _mm_aesdec_si128(__m128i a, __m128i RoundKey) +{ + return vreinterpretq_m128i_u8(veorq_u8( + vaesimcq_u8(vaesdq_u8(vreinterpretq_u8_m128i(a), vdupq_n_u8(0))), + vreinterpretq_u8_m128i(RoundKey))); +} + +// Perform the last round of an AES encryption flow on data (state) in a using +// the round key in RoundKey, and store the result in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_aesenclast_si128 +FORCE_INLINE __m128i _mm_aesenclast_si128(__m128i a, __m128i RoundKey) +{ + return _mm_xor_si128(vreinterpretq_m128i_u8(vaeseq_u8( + vreinterpretq_u8_m128i(a), vdupq_n_u8(0))), + RoundKey); +} + +// Perform the last round of an AES decryption flow on data (state) in a using +// the round key in RoundKey, and store the result in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_aesdeclast_si128 +FORCE_INLINE __m128i _mm_aesdeclast_si128(__m128i a, __m128i RoundKey) +{ + return vreinterpretq_m128i_u8( + veorq_u8(vaesdq_u8(vreinterpretq_u8_m128i(a), vdupq_n_u8(0)), + vreinterpretq_u8_m128i(RoundKey))); +} + +// Perform the InvMixColumns transformation on a and store the result in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_aesimc_si128 +FORCE_INLINE __m128i _mm_aesimc_si128(__m128i a) +{ + return vreinterpretq_m128i_u8(vaesimcq_u8(vreinterpretq_u8_m128i(a))); +} + +// Assist in expanding the AES cipher key by computing steps towards generating +// a round key for encryption cipher using data from a and an 8-bit round +// constant specified in imm8, and store the result in dst." +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_aeskeygenassist_si128 +FORCE_INLINE __m128i _mm_aeskeygenassist_si128(__m128i a, const int rcon) +{ + // AESE does ShiftRows and SubBytes on A + uint8x16_t u8 = vaeseq_u8(vreinterpretq_u8_m128i(a), vdupq_n_u8(0)); + +#ifndef _MSC_VER + uint8x16_t dest = { + // Undo ShiftRows step from AESE and extract X1 and X3 + u8[0x4], u8[0x1], u8[0xE], u8[0xB], // SubBytes(X1) + u8[0x1], u8[0xE], u8[0xB], u8[0x4], // ROT(SubBytes(X1)) + u8[0xC], u8[0x9], u8[0x6], u8[0x3], // SubBytes(X3) + u8[0x9], u8[0x6], u8[0x3], u8[0xC], // ROT(SubBytes(X3)) + }; + uint32x4_t r = {0, (unsigned) rcon, 0, (unsigned) rcon}; + return vreinterpretq_m128i_u8(dest) ^ vreinterpretq_m128i_u32(r); +#else + // We have to do this hack because MSVC is strictly adhering to the CPP + // standard, in particular C++03 8.5.1 sub-section 15, which states that + // unions must be initialized by their first member type. + + // As per the Windows ARM64 ABI, it is always little endian, so this works + __n128 dest{ + ((uint64_t) u8.n128_u8[0x4] << 0) | ((uint64_t) u8.n128_u8[0x1] << 8) | + ((uint64_t) u8.n128_u8[0xE] << 16) | + ((uint64_t) u8.n128_u8[0xB] << 24) | + ((uint64_t) u8.n128_u8[0x1] << 32) | + ((uint64_t) u8.n128_u8[0xE] << 40) | + ((uint64_t) u8.n128_u8[0xB] << 48) | + ((uint64_t) u8.n128_u8[0x4] << 56), + ((uint64_t) u8.n128_u8[0xC] << 0) | ((uint64_t) u8.n128_u8[0x9] << 8) | + ((uint64_t) u8.n128_u8[0x6] << 16) | + ((uint64_t) u8.n128_u8[0x3] << 24) | + ((uint64_t) u8.n128_u8[0x9] << 32) | + ((uint64_t) u8.n128_u8[0x6] << 40) | + ((uint64_t) u8.n128_u8[0x3] << 48) | + ((uint64_t) u8.n128_u8[0xC] << 56)}; + + dest.n128_u32[1] = dest.n128_u32[1] ^ rcon; + dest.n128_u32[3] = dest.n128_u32[3] ^ rcon; + + return dest; +#endif +} +#endif + +/* Others */ + +// Perform a carry-less multiplication of two 64-bit integers, selected from a +// and b according to imm8, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_clmulepi64_si128 +FORCE_INLINE __m128i _mm_clmulepi64_si128(__m128i _a, __m128i _b, const int imm) +{ + uint64x2_t a = vreinterpretq_u64_m128i(_a); + uint64x2_t b = vreinterpretq_u64_m128i(_b); + switch (imm & 0x11) { + case 0x00: + return vreinterpretq_m128i_u64( + _sse2neon_vmull_p64(vget_low_u64(a), vget_low_u64(b))); + case 0x01: + return vreinterpretq_m128i_u64( + _sse2neon_vmull_p64(vget_high_u64(a), vget_low_u64(b))); + case 0x10: + return vreinterpretq_m128i_u64( + _sse2neon_vmull_p64(vget_low_u64(a), vget_high_u64(b))); + case 0x11: + return vreinterpretq_m128i_u64( + _sse2neon_vmull_p64(vget_high_u64(a), vget_high_u64(b))); + default: + abort(); + } +} + +FORCE_INLINE unsigned int _sse2neon_mm_get_denormals_zero_mode(void) +{ + union { + fpcr_bitfield field; +#if defined(__aarch64__) || defined(_M_ARM64) + uint64_t value; +#else + uint32_t value; +#endif + } r; + +#if defined(__aarch64__) || defined(_M_ARM64) + r.value = _sse2neon_get_fpcr(); +#else + __asm__ __volatile__("vmrs %0, FPSCR" : "=r"(r.value)); /* read */ +#endif + + return r.field.bit24 ? _MM_DENORMALS_ZERO_ON : _MM_DENORMALS_ZERO_OFF; +} + +// Count the number of bits set to 1 in unsigned 32-bit integer a, and +// return that count in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_popcnt_u32 +FORCE_INLINE int _mm_popcnt_u32(unsigned int a) +{ +#if defined(__aarch64__) || defined(_M_ARM64) +#if __has_builtin(__builtin_popcount) + return __builtin_popcount(a); +#elif defined(_MSC_VER) + return _CountOneBits(a); +#else + return (int) vaddlv_u8(vcnt_u8(vcreate_u8((uint64_t) a))); +#endif +#else + uint32_t count = 0; + uint8x8_t input_val, count8x8_val; + uint16x4_t count16x4_val; + uint32x2_t count32x2_val; + + input_val = vld1_u8((uint8_t *) &a); + count8x8_val = vcnt_u8(input_val); + count16x4_val = vpaddl_u8(count8x8_val); + count32x2_val = vpaddl_u16(count16x4_val); + + vst1_u32(&count, count32x2_val); + return count; +#endif +} + +// Count the number of bits set to 1 in unsigned 64-bit integer a, and +// return that count in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_popcnt_u64 +FORCE_INLINE int64_t _mm_popcnt_u64(uint64_t a) +{ +#if defined(__aarch64__) || defined(_M_ARM64) +#if __has_builtin(__builtin_popcountll) + return __builtin_popcountll(a); +#elif defined(_MSC_VER) + return _CountOneBits64(a); +#else + return (int64_t) vaddlv_u8(vcnt_u8(vcreate_u8(a))); +#endif +#else + uint64_t count = 0; + uint8x8_t input_val, count8x8_val; + uint16x4_t count16x4_val; + uint32x2_t count32x2_val; + uint64x1_t count64x1_val; + + input_val = vld1_u8((uint8_t *) &a); + count8x8_val = vcnt_u8(input_val); + count16x4_val = vpaddl_u8(count8x8_val); + count32x2_val = vpaddl_u16(count16x4_val); + count64x1_val = vpaddl_u32(count32x2_val); + vst1_u64(&count, count64x1_val); + return count; +#endif +} + +FORCE_INLINE void _sse2neon_mm_set_denormals_zero_mode(unsigned int flag) +{ + // AArch32 Advanced SIMD arithmetic always uses the Flush-to-zero setting, + // regardless of the value of the FZ bit. + union { + fpcr_bitfield field; +#if defined(__aarch64__) || defined(_M_ARM64) + uint64_t value; +#else + uint32_t value; +#endif + } r; + +#if defined(__aarch64__) || defined(_M_ARM64) + r.value = _sse2neon_get_fpcr(); +#else + __asm__ __volatile__("vmrs %0, FPSCR" : "=r"(r.value)); /* read */ +#endif + + r.field.bit24 = (flag & _MM_DENORMALS_ZERO_MASK) == _MM_DENORMALS_ZERO_ON; + +#if defined(__aarch64__) || defined(_M_ARM64) + _sse2neon_set_fpcr(r.value); +#else + __asm__ __volatile__("vmsr FPSCR, %0" ::"r"(r)); /* write */ +#endif +} + +// Return the current 64-bit value of the processor's time-stamp counter. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=rdtsc +FORCE_INLINE uint64_t _rdtsc(void) +{ +#if defined(__aarch64__) || defined(_M_ARM64) + uint64_t val; + + /* According to ARM DDI 0487F.c, from Armv8.0 to Armv8.5 inclusive, the + * system counter is at least 56 bits wide; from Armv8.6, the counter + * must be 64 bits wide. So the system counter could be less than 64 + * bits wide and it is attributed with the flag 'cap_user_time_short' + * is true. + */ +#if defined(_MSC_VER) + val = _ReadStatusReg(ARM64_SYSREG(3, 3, 14, 0, 2)); +#else + __asm__ __volatile__("mrs %0, cntvct_el0" : "=r"(val)); +#endif + + return val; +#else + uint32_t pmccntr, pmuseren, pmcntenset; + // Read the user mode Performance Monitoring Unit (PMU) + // User Enable Register (PMUSERENR) access permissions. + __asm__ __volatile__("mrc p15, 0, %0, c9, c14, 0" : "=r"(pmuseren)); + if (pmuseren & 1) { // Allows reading PMUSERENR for user mode code. + __asm__ __volatile__("mrc p15, 0, %0, c9, c12, 1" : "=r"(pmcntenset)); + if (pmcntenset & 0x80000000UL) { // Is it counting? + __asm__ __volatile__("mrc p15, 0, %0, c9, c13, 0" : "=r"(pmccntr)); + // The counter is set up to count every 64th cycle + return (uint64_t) (pmccntr) << 6; + } + } + + // Fallback to syscall as we can't enable PMUSERENR in user mode. + struct timeval tv; + gettimeofday(&tv, NULL); + return (uint64_t) (tv.tv_sec) * 1000000 + tv.tv_usec; +#endif +} + +#if defined(__GNUC__) || defined(__clang__) +#pragma pop_macro("ALIGN_STRUCT") +#pragma pop_macro("FORCE_INLINE") +#endif + +#if defined(__GNUC__) && !defined(__clang__) +#pragma GCC pop_options +#endif + +#endif diff --git a/externals/xbyak b/externals/xbyak index a1ac3750f..9c0f5d3ec 160000 --- a/externals/xbyak +++ b/externals/xbyak @@ -1 +1 @@ -Subproject commit a1ac3750f9a639b5a6c6d6c7da4259b8d6790989 +Subproject commit 9c0f5d3ecb06d2c93c2b59becb9b3b763213e74e diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index cf05a3fe3..697a9ab97 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -165,6 +165,7 @@ else() if (MINGW) add_definitions(-DMINGW_HAS_SECURE_API) + add_compile_options("-msse4.1") if (MINGW_STATIC_BUILD) add_definitions(-DQT_STATICPLUGIN) diff --git a/src/common/logging/backend.cpp b/src/common/logging/backend.cpp index 7a267f8c0..f9f7df74e 100644 --- a/src/common/logging/backend.cpp +++ b/src/common/logging/backend.cpp @@ -343,7 +343,7 @@ void SetColorConsoleBackendEnabled(bool enabled) { } void FmtLogMessageImpl(Class log_class, Level log_level, const char* filename, - unsigned int line_num, const char* function, const char* format, + unsigned int line_num, const char* function, fmt::string_view format, const fmt::format_args& args) { if (!initialization_in_progress_suppress_logging) { Impl::Instance().PushEntry(log_class, log_level, filename, line_num, function, diff --git a/src/common/logging/log.h b/src/common/logging/log.h index c00c01a9e..79f710265 100644 --- a/src/common/logging/log.h +++ b/src/common/logging/log.h @@ -24,12 +24,12 @@ constexpr const char* TrimSourcePath(std::string_view source) { /// Logs a message to the global logger, using fmt void FmtLogMessageImpl(Class log_class, Level log_level, const char* filename, - unsigned int line_num, const char* function, const char* format, + unsigned int line_num, const char* function, fmt::string_view format, const fmt::format_args& args); template void FmtLogMessage(Class log_class, Level log_level, const char* filename, unsigned int line_num, - const char* function, const char* format, const Args&... args) { + const char* function, fmt::format_string format, const Args&... args) { FmtLogMessageImpl(log_class, log_level, filename, line_num, function, format, fmt::make_format_args(args...)); } diff --git a/src/core/CMakeLists.txt b/src/core/CMakeLists.txt index de158eea7..c7f4df802 100644 --- a/src/core/CMakeLists.txt +++ b/src/core/CMakeLists.txt @@ -401,14 +401,16 @@ add_library(core STATIC hle/service/am/am_types.h hle/service/am/applet.cpp hle/service/am/applet.h + hle/service/am/applet_manager.cpp hle/service/am/applet_data_broker.cpp hle/service/am/applet_data_broker.h - hle/service/am/applet_manager.cpp hle/service/am/applet_manager.h - hle/service/am/applet_message_queue.cpp - hle/service/am/applet_message_queue.h + hle/service/am/button_poller.cpp + hle/service/am/button_poller.h hle/service/am/display_layer_manager.cpp hle/service/am/display_layer_manager.h + hle/service/am/event_observer.cpp + hle/service/am/event_observer.h hle/service/am/frontend/applet_cabinet.cpp hle/service/am/frontend/applet_cabinet.h hle/service/am/frontend/applet_controller.cpp @@ -434,8 +436,12 @@ add_library(core STATIC hle/service/am/hid_registration.h hle/service/am/library_applet_storage.cpp hle/service/am/library_applet_storage.h - hle/service/am/process.cpp - hle/service/am/process.h + hle/service/am/lifecycle_manager.cpp + hle/service/am/lifecycle_manager.h + hle/service/am/process_creation.cpp + hle/service/am/process_creation.h + hle/service/am/process_holder.cpp + hle/service/am/process_holder.h hle/service/am/service/all_system_applet_proxies_service.cpp hle/service/am/service/all_system_applet_proxies_service.h hle/service/am/service/applet_common_functions.cpp @@ -486,6 +492,8 @@ add_library(core STATIC hle/service/am/service/system_applet_proxy.h hle/service/am/service/window_controller.cpp hle/service/am/service/window_controller.h + hle/service/am/window_system.cpp + hle/service/am/window_system.h hle/service/aoc/addon_content_manager.cpp hle/service/aoc/addon_content_manager.h hle/service/aoc/purchase_event_manager.cpp @@ -918,6 +926,8 @@ add_library(core STATIC hle/service/os/multi_wait_utils.h hle/service/os/mutex.cpp hle/service/os/mutex.h + hle/service/os/process.cpp + hle/service/os/process.h hle/service/pcie/pcie.cpp hle/service/pcie/pcie.h hle/service/pctl/parental_control_service_factory.cpp diff --git a/src/core/core.cpp b/src/core/core.cpp index dc515bc82..e1c8b41ee 100644 --- a/src/core/core.cpp +++ b/src/core/core.cpp @@ -3,7 +3,6 @@ #include #include -#include #include #include @@ -20,7 +19,6 @@ #include "core/cpu_manager.h" #include "core/debugger/debugger.h" #include "core/device_memory.h" -#include "core/file_sys/bis_factory.h" #include "core/file_sys/fs_filesystem.h" #include "core/file_sys/patch_manager.h" #include "core/file_sys/registered_cache.h" @@ -38,6 +36,7 @@ #include "core/hle/service/acc/profile_manager.h" #include "core/hle/service/am/applet_manager.h" #include "core/hle/service/am/frontend/applets.h" +#include "core/hle/service/am/process_creation.h" #include "core/hle/service/apm/apm_controller.h" #include "core/hle/service/filesystem/filesystem.h" #include "core/hle/service/glue/glue_manager.h" @@ -72,30 +71,6 @@ MICROPROFILE_DEFINE(ARM_CPU3, "ARM", "CPU 3", MP_RGB(255, 64, 64)); namespace Core { -namespace { - -FileSys::StorageId GetStorageIdForFrontendSlot( - std::optional slot) { - if (!slot.has_value()) { - return FileSys::StorageId::None; - } - - switch (*slot) { - case FileSys::ContentProviderUnionSlot::UserNAND: - return FileSys::StorageId::NandUser; - case FileSys::ContentProviderUnionSlot::SysNAND: - return FileSys::StorageId::NandSystem; - case FileSys::ContentProviderUnionSlot::SDMC: - return FileSys::StorageId::SdCard; - case FileSys::ContentProviderUnionSlot::FrontendManual: - return FileSys::StorageId::Host; - default: - return FileSys::StorageId::None; - } -} - -} // Anonymous namespace - FileSys::VirtualFile GetGameFileFromPath(const FileSys::VirtualFilesystem& vfs, const std::string& path) { // To account for split 00+01+etc files. @@ -297,9 +272,6 @@ struct System::Impl { } SystemResultStatus SetupForApplicationProcess(System& system, Frontend::EmuWindow& emu_window) { - /// Reset all glue registrations - arp_manager.ResetAll(); - telemetry_session = std::make_unique(); host1x_core = std::make_unique(system); @@ -335,33 +307,17 @@ struct System::Impl { SystemResultStatus Load(System& system, Frontend::EmuWindow& emu_window, const std::string& filepath, Service::AM::FrontendAppletParameters& params) { - app_loader = Loader::GetLoader(system, GetGameFileFromPath(virtual_filesystem, filepath), - params.program_id, params.program_index); - - if (!app_loader) { - LOG_CRITICAL(Core, "Failed to obtain loader for {}!", filepath); - return SystemResultStatus::ErrorGetLoader; - } - - if (app_loader->ReadProgramId(params.program_id) != Loader::ResultStatus::Success) { - LOG_ERROR(Core, "Failed to find title id for ROM!"); - } - - std::string name = "Unknown program"; - if (app_loader->ReadTitle(name) != Loader::ResultStatus::Success) { - LOG_ERROR(Core, "Failed to read title for ROM!"); - } - - LOG_INFO(Core, "Loading {} ({})", name, params.program_id); - InitializeKernel(system); - // Create the application process. - auto main_process = Kernel::KProcess::Create(system.Kernel()); - Kernel::KProcess::Register(system.Kernel(), main_process); - kernel.AppendNewProcess(main_process); - kernel.MakeApplicationProcess(main_process); - const auto [load_result, load_parameters] = app_loader->Load(*main_process, system); + const auto file = GetGameFileFromPath(virtual_filesystem, filepath); + + // Create the application process + Loader::ResultStatus load_result{}; + std::vector control; + auto process = + Service::AM::CreateApplicationProcess(control, app_loader, load_result, system, file, + params.program_id, params.program_index); + if (load_result != Loader::ResultStatus::Success) { LOG_CRITICAL(Core, "Failed to load ROM (Error {})!", load_result); ShutdownMainProcess(); @@ -370,6 +326,25 @@ struct System::Impl { static_cast(SystemResultStatus::ErrorLoader) + static_cast(load_result)); } + if (!app_loader) { + LOG_CRITICAL(Core, "Failed to obtain loader for {}!", filepath); + return SystemResultStatus::ErrorGetLoader; + } + + if (app_loader->ReadProgramId(params.program_id) != Loader::ResultStatus::Success) { + LOG_ERROR(Core, "Failed to find program id for ROM!"); + } + + std::string name = "Unknown program"; + if (app_loader->ReadTitle(name) != Loader::ResultStatus::Success) { + LOG_ERROR(Core, "Failed to read title for ROM!"); + } + + LOG_INFO(Core, "Loading {} ({:016X}) ...", name, params.program_id); + + // Make the process created be the application + kernel.MakeApplicationProcess(process->GetHandle()); + // Set up the rest of the system. SystemResultStatus init_result{SetupForApplicationProcess(system, emu_window)}; if (init_result != SystemResultStatus::Success) { @@ -379,7 +354,6 @@ struct System::Impl { return init_result; } - AddGlueRegistrationForProcess(*app_loader, *main_process); telemetry_session->AddInitialInfo(*app_loader, fs_controller, *content_provider); // Initialize cheat engine @@ -387,14 +361,9 @@ struct System::Impl { cheat_engine->Initialize(); } - // Register with applet manager. - applet_manager.CreateAndInsertByFrontendAppletParameters(main_process->GetProcessId(), - params); - - // All threads are started, begin main process execution, now that we're in the clear. - main_process->Run(load_parameters->main_thread_priority, - load_parameters->main_thread_stack_size); - main_process->Close(); + // Register with applet manager + // All threads are started, begin main process execution, now that we're in the clear + applet_manager.CreateAndInsertByFrontendAppletParameters(std::move(process), params); if (Settings::values.gamecard_inserted) { if (Settings::values.gamecard_current_game) { @@ -461,7 +430,6 @@ struct System::Impl { kernel.SuspendEmulation(true); kernel.CloseServices(); kernel.ShutdownCores(); - applet_manager.Reset(); services.reset(); service_manager.reset(); fs_controller.Reset(); @@ -484,6 +452,9 @@ struct System::Impl { room_member->SendGameInfo(game_info); } + // Reset all glue registrations + arp_manager.ResetAll(); + LOG_DEBUG(Core, "Shutdown OK"); } @@ -501,31 +472,6 @@ struct System::Impl { return app_loader->ReadTitle(out); } - void AddGlueRegistrationForProcess(Loader::AppLoader& loader, Kernel::KProcess& process) { - std::vector nacp_data; - FileSys::NACP nacp; - if (loader.ReadControlData(nacp) == Loader::ResultStatus::Success) { - nacp_data = nacp.GetRawBytes(); - } else { - nacp_data.resize(sizeof(FileSys::RawNACP)); - } - - Service::Glue::ApplicationLaunchProperty launch{}; - launch.title_id = process.GetProgramId(); - - FileSys::PatchManager pm{launch.title_id, fs_controller, *content_provider}; - launch.version = pm.GetGameVersion().value_or(0); - - // TODO(DarkLordZach): When FSController/Game Card Support is added, if - // current_process_game_card use correct StorageId - launch.base_game_storage_id = GetStorageIdForFrontendSlot(content_provider->GetSlotForEntry( - launch.title_id, FileSys::ContentRecordType::Program)); - launch.update_storage_id = GetStorageIdForFrontendSlot(content_provider->GetSlotForEntry( - FileSys::GetUpdateTitleID(launch.title_id), FileSys::ContentRecordType::Program)); - - arp_manager.Register(launch.title_id, launch, std::move(nacp_data)); - } - void SetStatus(SystemResultStatus new_status, const char* details = nullptr) { status = new_status; if (details) { diff --git a/src/core/device_memory_manager.h b/src/core/device_memory_manager.h index 0568a821b..6dcf7bb22 100644 --- a/src/core/device_memory_manager.h +++ b/src/core/device_memory_manager.h @@ -43,6 +43,8 @@ public: DeviceMemoryManager(const DeviceMemory& device_memory); ~DeviceMemoryManager(); + static constexpr bool HAS_FLUSH_INVALIDATION = true; + void BindInterface(DeviceInterface* device_inter); DAddr Allocate(size_t size); diff --git a/src/core/file_sys/partition_filesystem.cpp b/src/core/file_sys/partition_filesystem.cpp index dd8de9d8a..8ad2307cb 100644 --- a/src/core/file_sys/partition_filesystem.cpp +++ b/src/core/file_sys/partition_filesystem.cpp @@ -105,12 +105,4 @@ VirtualDir PartitionFilesystem::GetParentDirectory() const { return nullptr; } -void PartitionFilesystem::PrintDebugInfo() const { - LOG_DEBUG(Service_FS, "Magic: {:.4}", pfs_header.magic); - LOG_DEBUG(Service_FS, "Files: {}", pfs_header.num_entries); - for (u32 i = 0; i < pfs_header.num_entries; i++) { - LOG_DEBUG(Service_FS, " > File {}: {} (0x{:X} bytes)", i, - pfs_files[i]->GetName(), pfs_files[i]->GetSize()); - } -} } // namespace FileSys diff --git a/src/core/file_sys/partition_filesystem.h b/src/core/file_sys/partition_filesystem.h index 777b9ead9..c64b29b5d 100644 --- a/src/core/file_sys/partition_filesystem.h +++ b/src/core/file_sys/partition_filesystem.h @@ -35,7 +35,6 @@ public: std::vector GetSubdirectories() const override; std::string GetName() const override; VirtualDir GetParentDirectory() const override; - void PrintDebugInfo() const; private: struct Header { diff --git a/src/core/file_sys/vfs/vfs_offset.cpp b/src/core/file_sys/vfs/vfs_offset.cpp index 1a37d2670..6b51ed3eb 100644 --- a/src/core/file_sys/vfs/vfs_offset.cpp +++ b/src/core/file_sys/vfs/vfs_offset.cpp @@ -9,9 +9,8 @@ namespace FileSys { OffsetVfsFile::OffsetVfsFile(VirtualFile file_, std::size_t size_, std::size_t offset_, - std::string name_, VirtualDir parent_) - : file(file_), offset(offset_), size(size_), name(std::move(name_)), - parent(parent_ == nullptr ? file->GetContainingDirectory() : std::move(parent_)) {} + std::string name_) + : file(file_), offset(offset_), size(size_), name(std::move(name_)) {} OffsetVfsFile::~OffsetVfsFile() = default; @@ -37,7 +36,7 @@ bool OffsetVfsFile::Resize(std::size_t new_size) { } VirtualDir OffsetVfsFile::GetContainingDirectory() const { - return parent; + return nullptr; } bool OffsetVfsFile::IsWritable() const { diff --git a/src/core/file_sys/vfs/vfs_offset.h b/src/core/file_sys/vfs/vfs_offset.h index 4abe41d8e..9800b4fd0 100644 --- a/src/core/file_sys/vfs/vfs_offset.h +++ b/src/core/file_sys/vfs/vfs_offset.h @@ -16,7 +16,7 @@ namespace FileSys { class OffsetVfsFile : public VfsFile { public: OffsetVfsFile(VirtualFile file, std::size_t size, std::size_t offset = 0, - std::string new_name = "", VirtualDir new_parent = nullptr); + std::string new_name = ""); ~OffsetVfsFile() override; std::string GetName() const override; @@ -44,7 +44,6 @@ private: std::size_t offset; std::size_t size; std::string name; - VirtualDir parent; }; } // namespace FileSys diff --git a/src/core/file_sys/vfs/vfs_real.cpp b/src/core/file_sys/vfs/vfs_real.cpp index 3ad073e4a..052684e9d 100644 --- a/src/core/file_sys/vfs/vfs_real.cpp +++ b/src/core/file_sys/vfs/vfs_real.cpp @@ -76,6 +76,7 @@ VfsEntryType RealVfsFilesystem::GetEntryType(std::string_view path_) const { } VirtualFile RealVfsFilesystem::OpenFileFromEntry(std::string_view path_, std::optional size, + std::optional parent_path, OpenMode perms) { const auto path = FS::SanitizePath(path_, FS::DirectorySeparator::PlatformDefault); std::scoped_lock lk{list_lock}; @@ -94,14 +95,14 @@ VirtualFile RealVfsFilesystem::OpenFileFromEntry(std::string_view path_, std::op this->InsertReferenceIntoListLocked(*reference); auto file = std::shared_ptr( - new RealVfsFile(*this, std::move(reference), path, perms, size)); + new RealVfsFile(*this, std::move(reference), path, perms, size, std::move(parent_path))); cache[path] = file; return file; } VirtualFile RealVfsFilesystem::OpenFile(std::string_view path_, OpenMode perms) { - return OpenFileFromEntry(path_, {}, perms); + return OpenFileFromEntry(path_, {}, {}, perms); } VirtualFile RealVfsFilesystem::CreateFile(std::string_view path_, OpenMode perms) { @@ -268,10 +269,11 @@ void RealVfsFilesystem::RemoveReferenceFromListLocked(FileReference& reference) } RealVfsFile::RealVfsFile(RealVfsFilesystem& base_, std::unique_ptr reference_, - const std::string& path_, OpenMode perms_, std::optional size_) + const std::string& path_, OpenMode perms_, std::optional size_, + std::optional parent_path_) : base(base_), reference(std::move(reference_)), path(path_), - parent_path(FS::GetParentPath(path_)), path_components(FS::SplitPathComponentsCopy(path_)), - size(size_), perms(perms_) {} + parent_path(parent_path_ ? std::move(*parent_path_) : FS::GetParentPath(path_)), + path_components(FS::SplitPathComponentsCopy(path_)), size(size_), perms(perms_) {} RealVfsFile::~RealVfsFile() { base.DropReference(std::move(reference)); @@ -348,7 +350,7 @@ std::vector RealVfsDirectory::IterateEntries( &out](const std::filesystem::directory_entry& entry) { const auto full_path_string = FS::PathToUTF8String(entry.path()); - out.emplace_back(base.OpenFileFromEntry(full_path_string, entry.file_size(), perms)); + out.emplace_back(base.OpenFileFromEntry(full_path_string, entry.file_size(), path, perms)); return true; }; diff --git a/src/core/file_sys/vfs/vfs_real.h b/src/core/file_sys/vfs/vfs_real.h index 5c2172cce..a773fc375 100644 --- a/src/core/file_sys/vfs/vfs_real.h +++ b/src/core/file_sys/vfs/vfs_real.h @@ -62,6 +62,7 @@ private: private: friend class RealVfsDirectory; VirtualFile OpenFileFromEntry(std::string_view path, std::optional size, + std::optional parent_path, OpenMode perms = OpenMode::Read); private: @@ -91,7 +92,7 @@ public: private: RealVfsFile(RealVfsFilesystem& base, std::unique_ptr reference, const std::string& path, OpenMode perms = OpenMode::Read, - std::optional size = {}); + std::optional size = {}, std::optional parent_path = {}); RealVfsFilesystem& base; std::unique_ptr reference; diff --git a/src/core/guest_memory.h b/src/core/guest_memory.h index 7ee18c126..83292f702 100644 --- a/src/core/guest_memory.h +++ b/src/core/guest_memory.h @@ -44,15 +44,32 @@ public: GuestMemory() = delete; explicit GuestMemory(M& memory, u64 addr, std::size_t size, Common::ScratchBuffer* backup = nullptr) - : m_memory{memory}, m_addr{addr}, m_size{size} { + : m_memory{&memory}, m_addr{addr}, m_size{size} { static_assert(FLAGS & GuestMemoryFlags::Read || FLAGS & GuestMemoryFlags::Write); - if constexpr (FLAGS & GuestMemoryFlags::Read) { + if constexpr (!(FLAGS & GuestMemoryFlags::Read)) { + if (!this->TrySetSpan()) { + if (backup) { + backup->resize_destructive(this->size()); + m_data_span = *backup; + m_span_valid = true; + m_is_data_copy = true; + } else { + m_data_copy.resize(this->size()); + m_data_span = std::span(m_data_copy); + m_span_valid = true; + m_is_data_copy = true; + } + } + } else if constexpr (FLAGS & GuestMemoryFlags::Read) { Read(addr, size, backup); } } ~GuestMemory() = default; + GuestMemory(GuestMemory&& rhs) = default; + GuestMemory& operator=(GuestMemory&& rhs) = default; + T* data() noexcept { return m_data_span.data(); } @@ -109,8 +126,8 @@ public: } if (this->TrySetSpan()) { - if constexpr (FLAGS & GuestMemoryFlags::Safe) { - m_memory.FlushRegion(m_addr, this->size_bytes()); + if constexpr (FLAGS & GuestMemoryFlags::Safe && M::HAS_FLUSH_INVALIDATION) { + m_memory->FlushRegion(m_addr, this->size_bytes()); } } else { if (backup) { @@ -123,9 +140,9 @@ public: m_is_data_copy = true; m_span_valid = true; if constexpr (FLAGS & GuestMemoryFlags::Safe) { - m_memory.ReadBlock(m_addr, this->data(), this->size_bytes()); + m_memory->ReadBlock(m_addr, this->data(), this->size_bytes()); } else { - m_memory.ReadBlockUnsafe(m_addr, this->data(), this->size_bytes()); + m_memory->ReadBlockUnsafe(m_addr, this->data(), this->size_bytes()); } } return m_data_span; @@ -133,18 +150,19 @@ public: void Write(std::span write_data) noexcept { if constexpr (FLAGS & GuestMemoryFlags::Cached) { - m_memory.WriteBlockCached(m_addr, write_data.data(), this->size_bytes()); + m_memory->WriteBlockCached(m_addr, write_data.data(), this->size_bytes()); } else if constexpr (FLAGS & GuestMemoryFlags::Safe) { - m_memory.WriteBlock(m_addr, write_data.data(), this->size_bytes()); + m_memory->WriteBlock(m_addr, write_data.data(), this->size_bytes()); } else { - m_memory.WriteBlockUnsafe(m_addr, write_data.data(), this->size_bytes()); + m_memory->WriteBlockUnsafe(m_addr, write_data.data(), this->size_bytes()); } } bool TrySetSpan() noexcept { - if (u8* ptr = m_memory.GetSpan(m_addr, this->size_bytes()); ptr) { + if (u8* ptr = m_memory->GetSpan(m_addr, this->size_bytes()); ptr) { m_data_span = {reinterpret_cast(ptr), this->size()}; m_span_valid = true; + m_is_data_copy = false; return true; } return false; @@ -159,7 +177,7 @@ protected: return m_addr_changed; } - M& m_memory; + M* m_memory; u64 m_addr{}; size_t m_size{}; std::span m_data_span{}; @@ -175,17 +193,7 @@ public: GuestMemoryScoped() = delete; explicit GuestMemoryScoped(M& memory, u64 addr, std::size_t size, Common::ScratchBuffer* backup = nullptr) - : GuestMemory(memory, addr, size, backup) { - if constexpr (!(FLAGS & GuestMemoryFlags::Read)) { - if (!this->TrySetSpan()) { - if (backup) { - this->m_data_span = *backup; - this->m_span_valid = true; - this->m_is_data_copy = true; - } - } - } - } + : GuestMemory(memory, addr, size, backup) {} ~GuestMemoryScoped() { if constexpr (FLAGS & GuestMemoryFlags::Write) { @@ -196,15 +204,17 @@ public: if (this->AddressChanged() || this->IsDataCopy()) { ASSERT(this->m_span_valid); if constexpr (FLAGS & GuestMemoryFlags::Cached) { - this->m_memory.WriteBlockCached(this->m_addr, this->data(), this->size_bytes()); + this->m_memory->WriteBlockCached(this->m_addr, this->data(), + this->size_bytes()); } else if constexpr (FLAGS & GuestMemoryFlags::Safe) { - this->m_memory.WriteBlock(this->m_addr, this->data(), this->size_bytes()); + this->m_memory->WriteBlock(this->m_addr, this->data(), this->size_bytes()); } else { - this->m_memory.WriteBlockUnsafe(this->m_addr, this->data(), this->size_bytes()); + this->m_memory->WriteBlockUnsafe(this->m_addr, this->data(), + this->size_bytes()); } } else if constexpr ((FLAGS & GuestMemoryFlags::Safe) || (FLAGS & GuestMemoryFlags::Cached)) { - this->m_memory.InvalidateRegion(this->m_addr, this->size_bytes()); + this->m_memory->InvalidateRegion(this->m_addr, this->size_bytes()); } } } diff --git a/src/core/hle/kernel/k_process.cpp b/src/core/hle/kernel/k_process.cpp index cb9a11a63..80566b7e7 100644 --- a/src/core/hle/kernel/k_process.cpp +++ b/src/core/hle/kernel/k_process.cpp @@ -1170,6 +1170,7 @@ Result KProcess::LoadFromMetadata(const FileSys::ProgramMetadata& metadata, std: // Determine if we are an application. if (pool == KMemoryManager::Pool::Application) { flag |= Svc::CreateProcessFlag::IsApplication; + m_is_application = true; } // If we are 64-bit, create as such. diff --git a/src/core/hle/service/am/am.cpp b/src/core/hle/service/am/am.cpp index 8c4e14f08..2ef393439 100644 --- a/src/core/hle/service/am/am.cpp +++ b/src/core/hle/service/am/am.cpp @@ -2,19 +2,26 @@ // SPDX-License-Identifier: GPL-2.0-or-later #include "core/hle/service/am/am.h" +#include "core/hle/service/am/button_poller.h" +#include "core/hle/service/am/event_observer.h" #include "core/hle/service/am/service/all_system_applet_proxies_service.h" #include "core/hle/service/am/service/application_proxy_service.h" +#include "core/hle/service/am/window_system.h" #include "core/hle/service/server_manager.h" namespace Service::AM { void LoopProcess(Core::System& system) { + WindowSystem window_system(system); + ButtonPoller button_poller(system, window_system); + EventObserver event_observer(system, window_system); + auto server_manager = std::make_unique(system); - server_manager->RegisterNamedService("appletAE", - std::make_shared(system)); - server_manager->RegisterNamedService("appletOE", - std::make_shared(system)); + server_manager->RegisterNamedService( + "appletAE", std::make_shared(system, window_system)); + server_manager->RegisterNamedService( + "appletOE", std::make_shared(system, window_system)); ServerManager::RunServer(std::move(server_manager)); } diff --git a/src/core/hle/service/am/am_results.h b/src/core/hle/service/am/am_results.h index a2afc9eec..44846aa2e 100644 --- a/src/core/hle/service/am/am_results.h +++ b/src/core/hle/service/am/am_results.h @@ -9,6 +9,7 @@ namespace Service::AM { constexpr Result ResultNoDataInChannel{ErrorModule::AM, 2}; constexpr Result ResultNoMessages{ErrorModule::AM, 3}; +constexpr Result ResultLibraryAppletTerminated{ErrorModule::AM, 22}; constexpr Result ResultInvalidOffset{ErrorModule::AM, 503}; constexpr Result ResultInvalidStorageType{ErrorModule::AM, 511}; constexpr Result ResultFatalSectionCountImbalance{ErrorModule::AM, 512}; diff --git a/src/core/hle/service/am/am_types.h b/src/core/hle/service/am/am_types.h index a14defb40..eb9ad0ac5 100644 --- a/src/core/hle/service/am/am_types.h +++ b/src/core/hle/service/am/am_types.h @@ -61,12 +61,6 @@ enum class ScreenshotPermission : u32 { Disable = 2, }; -struct FocusHandlingMode { - bool notify; - bool background; - bool suspend; -}; - enum class IdleTimeDetectionExtension : u32 { Disabled = 0, Extended = 1, @@ -239,7 +233,6 @@ struct ApplicationPlayStatistics { static_assert(sizeof(ApplicationPlayStatistics) == 0x18, "ApplicationPlayStatistics has incorrect size."); -using AppletResourceUserId = u64; using ProgramId = u64; struct Applet; diff --git a/src/core/hle/service/am/applet.cpp b/src/core/hle/service/am/applet.cpp index 5b9056c12..6847f250c 100644 --- a/src/core/hle/service/am/applet.cpp +++ b/src/core/hle/service/am/applet.cpp @@ -1,27 +1,71 @@ // SPDX-FileCopyrightText: Copyright 2024 yuzu Emulator Project // SPDX-License-Identifier: GPL-2.0-or-later -#include "common/scope_exit.h" - #include "core/core.h" -#include "core/hle/service/am/am_results.h" #include "core/hle/service/am/applet.h" #include "core/hle/service/am/applet_manager.h" namespace Service::AM { -Applet::Applet(Core::System& system, std::unique_ptr process_) - : context(system, "Applet"), message_queue(system), process(std::move(process_)), - hid_registration(system, *process), gpu_error_detected_event(context), - friend_invitation_storage_channel_event(context), notification_storage_channel_event(context), - health_warning_disappeared_system_event(context), acquired_sleep_lock_event(context), - pop_from_general_channel_event(context), library_applet_launchable_event(context), - accumulated_suspended_tick_changed_event(context), sleep_lock_event(context) { +Applet::Applet(Core::System& system, std::unique_ptr process_, bool is_application) + : context(system, "Applet"), lifecycle_manager(system, context, is_application), + process(std::move(process_)), hid_registration(system, *process), + gpu_error_detected_event(context), friend_invitation_storage_channel_event(context), + notification_storage_channel_event(context), health_warning_disappeared_system_event(context), + acquired_sleep_lock_event(context), pop_from_general_channel_event(context), + library_applet_launchable_event(context), accumulated_suspended_tick_changed_event(context), + sleep_lock_event(context), state_changed_event(context) { - aruid = process->GetProcessId(); + aruid.pid = process->GetProcessId(); program_id = process->GetProgramId(); } Applet::~Applet() = default; +void Applet::UpdateSuspensionStateLocked(bool force_message) { + // Remove any forced resumption. + lifecycle_manager.RemoveForceResumeIfPossible(); + + // Check if we're runnable. + const bool curr_activity_runnable = lifecycle_manager.IsRunnable(); + const bool prev_activity_runnable = is_activity_runnable; + const bool was_changed = curr_activity_runnable != prev_activity_runnable; + + if (was_changed) { + if (curr_activity_runnable) { + process->Suspend(false); + } else { + process->Suspend(true); + lifecycle_manager.RequestResumeNotification(); + } + + is_activity_runnable = curr_activity_runnable; + } + + if (lifecycle_manager.GetForcedSuspend()) { + // TODO: why is this allowed? + return; + } + + // Signal if the focus state was changed or the process state was changed. + if (lifecycle_manager.UpdateRequestedFocusState() || was_changed || force_message) { + lifecycle_manager.SignalSystemEventIfNeeded(); + } +} + +void Applet::SetInteractibleLocked(bool interactible) { + if (is_interactible == interactible) { + return; + } + + is_interactible = interactible; + + hid_registration.EnableAppletToGetInput(interactible && !lifecycle_manager.GetExitRequested()); +} + +void Applet::OnProcessTerminatedLocked() { + is_completed = true; + state_changed_event.Signal(); +} + } // namespace Service::AM diff --git a/src/core/hle/service/am/applet.h b/src/core/hle/service/am/applet.h index ad602153e..571904fab 100644 --- a/src/core/hle/service/am/applet.h +++ b/src/core/hle/service/am/applet.h @@ -3,25 +3,28 @@ #pragma once +#include #include #include "common/math_util.h" #include "core/hle/service/apm/apm_controller.h" #include "core/hle/service/caps/caps_types.h" +#include "core/hle/service/cmif_types.h" #include "core/hle/service/kernel_helpers.h" #include "core/hle/service/os/event.h" +#include "core/hle/service/os/process.h" #include "core/hle/service/service.h" #include "core/hle/service/am/am_types.h" -#include "core/hle/service/am/applet_message_queue.h" #include "core/hle/service/am/display_layer_manager.h" #include "core/hle/service/am/hid_registration.h" -#include "core/hle/service/am/process.h" +#include "core/hle/service/am/lifecycle_manager.h" +#include "core/hle/service/am/process_holder.h" namespace Service::AM { struct Applet { - explicit Applet(Core::System& system, std::unique_ptr process_); + explicit Applet(Core::System& system, std::unique_ptr process_, bool is_application); ~Applet(); // Lock @@ -30,11 +33,13 @@ struct Applet { // Event creation helper KernelHelpers::ServiceContext context; - // Applet message queue - AppletMessageQueue message_queue; + // Lifecycle manager + LifecycleManager lifecycle_manager; // Process std::unique_ptr process; + std::optional process_holder; + bool is_process_running{}; // Creation state AppletId applet_id{}; @@ -75,11 +80,9 @@ struct Applet { bool game_play_recording_supported{}; GamePlayRecordingState game_play_recording_state{GamePlayRecordingState::Disabled}; bool jit_service_launched{}; - bool is_running{}; bool application_crash_report_enabled{}; // Common state - FocusState focus_state{}; bool sleep_lock_enabled{}; bool vr_mode_enabled{}; bool lcd_backlight_off_enabled{}; @@ -93,15 +96,12 @@ struct Applet { // Caller applet std::weak_ptr caller_applet{}; std::shared_ptr caller_applet_broker{}; + std::list> child_applets{}; + bool is_completed{}; // Self state bool exit_locked{}; s32 fatal_section_count{}; - bool operation_mode_changed_notification_enabled{true}; - bool performance_mode_changed_notification_enabled{true}; - FocusHandlingMode focus_handling_mode{}; - bool restart_message_enabled{}; - bool out_of_focus_suspension_enabled{true}; Capture::AlbumImageOrientation album_image_orientation{}; bool handles_request_to_display{}; ScreenshotPermission screenshot_permission{}; @@ -110,6 +110,9 @@ struct Applet { u64 suspended_ticks{}; bool album_image_taken_notification_enabled{}; bool record_volume_muted{}; + bool is_activity_runnable{}; + bool is_interactible{true}; + bool window_visible{true}; // Events Event gpu_error_detected_event; @@ -121,9 +124,15 @@ struct Applet { Event library_applet_launchable_event; Event accumulated_suspended_tick_changed_event; Event sleep_lock_event; + Event state_changed_event; // Frontend state std::shared_ptr frontend{}; + + // Process state management + void UpdateSuspensionStateLocked(bool force_message); + void SetInteractibleLocked(bool interactible); + void OnProcessTerminatedLocked(); }; } // namespace Service::AM diff --git a/src/core/hle/service/am/applet_data_broker.cpp b/src/core/hle/service/am/applet_data_broker.cpp index 9057244a9..fff78c5af 100644 --- a/src/core/hle/service/am/applet_data_broker.cpp +++ b/src/core/hle/service/am/applet_data_broker.cpp @@ -44,24 +44,8 @@ Kernel::KReadableEvent* AppletStorageChannel::GetEvent() { AppletDataBroker::AppletDataBroker(Core::System& system_) : system(system_), context(system_, "AppletDataBroker"), in_data(context), - interactive_in_data(context), out_data(context), interactive_out_data(context), - state_changed_event(context), is_completed(false) {} + interactive_in_data(context), out_data(context), interactive_out_data(context) {} AppletDataBroker::~AppletDataBroker() = default; -void AppletDataBroker::SignalCompletion() { - { - std::scoped_lock lk{lock}; - - if (is_completed) { - return; - } - - is_completed = true; - state_changed_event.Signal(); - } - - system.GetAppletManager().FocusStateChanged(); -} - } // namespace Service::AM diff --git a/src/core/hle/service/am/applet_data_broker.h b/src/core/hle/service/am/applet_data_broker.h index 5a1d43c11..2718f608a 100644 --- a/src/core/hle/service/am/applet_data_broker.h +++ b/src/core/hle/service/am/applet_data_broker.h @@ -53,16 +53,6 @@ public: return interactive_out_data; } - Event& GetStateChangedEvent() { - return state_changed_event; - } - - bool IsCompleted() const { - return is_completed; - } - - void SignalCompletion(); - private: Core::System& system; KernelHelpers::ServiceContext context; @@ -71,10 +61,6 @@ private: AppletStorageChannel interactive_in_data; AppletStorageChannel out_data; AppletStorageChannel interactive_out_data; - Event state_changed_event; - - std::mutex lock; - bool is_completed; }; } // namespace Service::AM diff --git a/src/core/hle/service/am/applet_manager.cpp b/src/core/hle/service/am/applet_manager.cpp index 2e109181d..c6b7ec8bb 100644 --- a/src/core/hle/service/am/applet_manager.cpp +++ b/src/core/hle/service/am/applet_manager.cpp @@ -13,6 +13,7 @@ #include "core/hle/service/am/frontend/applet_mii_edit_types.h" #include "core/hle/service/am/frontend/applet_software_keyboard_types.h" #include "core/hle/service/am/service/storage.h" +#include "core/hle/service/am/window_system.h" #include "hid_core/hid_types.h" namespace Service::AM { @@ -225,49 +226,46 @@ void PushInShowSoftwareKeyboard(Core::System& system, AppletStorageChannel& chan } // namespace AppletManager::AppletManager(Core::System& system) : m_system(system) {} -AppletManager::~AppletManager() { - this->Reset(); -} - -void AppletManager::InsertApplet(std::shared_ptr applet) { - std::scoped_lock lk{m_lock}; - - m_applets.emplace(applet->aruid, std::move(applet)); -} - -void AppletManager::TerminateAndRemoveApplet(AppletResourceUserId aruid) { - std::shared_ptr applet; - bool should_stop = false; - { - std::scoped_lock lk{m_lock}; - - const auto it = m_applets.find(aruid); - if (it == m_applets.end()) { - return; - } - - applet = it->second; - m_applets.erase(it); - - should_stop = m_applets.empty(); - } - - // Terminate process. - applet->process->Terminate(); - - // If there were no applets left, stop emulation. - if (should_stop) { - m_system.Exit(); - } -} +AppletManager::~AppletManager() = default; void AppletManager::CreateAndInsertByFrontendAppletParameters( - AppletResourceUserId aruid, const FrontendAppletParameters& params) { - // TODO: this should be run inside AM so that the events will have a parent process - // TODO: have am create the guest process - auto applet = std::make_shared(m_system, std::make_unique(m_system)); + std::unique_ptr process, const FrontendAppletParameters& params) { + { + std::scoped_lock lk{m_lock}; + m_pending_process = std::move(process); + m_pending_parameters = params; + } + m_cv.notify_all(); +} + +void AppletManager::RequestExit() { + std::scoped_lock lk{m_lock}; + if (m_window_system) { + m_window_system->OnExitRequested(); + } +} + +void AppletManager::OperationModeChanged() { + std::scoped_lock lk{m_lock}; + if (m_window_system) { + m_window_system->OnOperationModeChanged(); + } +} + +void AppletManager::SetWindowSystem(WindowSystem* window_system) { + std::unique_lock lk{m_lock}; + + m_window_system = window_system; + if (!m_window_system) { + return; + } + + m_cv.wait(lk, [&] { return m_pending_process != nullptr; }); + + const auto& params = m_pending_parameters; + auto applet = std::make_shared(m_system, std::move(m_pending_process), + params.applet_id == AppletId::Application); - applet->aruid = aruid; applet->program_id = params.program_id; applet->applet_id = params.applet_id; applet->type = params.applet_type; @@ -322,59 +320,19 @@ void AppletManager::CreateAndInsertByFrontendAppletParameters( } // Applet was started by frontend, so it is foreground. - applet->message_queue.PushMessage(AppletMessage::ChangeIntoForeground); - applet->message_queue.PushMessage(AppletMessage::FocusStateChanged); - applet->focus_state = FocusState::InFocus; + applet->lifecycle_manager.SetFocusState(FocusState::InFocus); - this->InsertApplet(std::move(applet)); -} - -std::shared_ptr AppletManager::GetByAppletResourceUserId(AppletResourceUserId aruid) const { - std::scoped_lock lk{m_lock}; - - if (const auto it = m_applets.find(aruid); it != m_applets.end()) { - return it->second; + if (applet->applet_id == AppletId::QLaunch) { + applet->lifecycle_manager.SetFocusHandlingMode(false); + applet->lifecycle_manager.SetOutOfFocusSuspendingEnabled(false); + m_window_system->TrackApplet(applet, false); + m_window_system->RequestHomeMenuToGetForeground(); + } else { + m_window_system->TrackApplet(applet, true); + m_window_system->RequestApplicationToGetForeground(); } - return {}; -} - -void AppletManager::Reset() { - std::scoped_lock lk{m_lock}; - - m_applets.clear(); -} - -void AppletManager::RequestExit() { - std::scoped_lock lk{m_lock}; - - for (const auto& [aruid, applet] : m_applets) { - applet->message_queue.RequestExit(); - } -} - -void AppletManager::RequestResume() { - std::scoped_lock lk{m_lock}; - - for (const auto& [aruid, applet] : m_applets) { - applet->message_queue.RequestResume(); - } -} - -void AppletManager::OperationModeChanged() { - std::scoped_lock lk{m_lock}; - - for (const auto& [aruid, applet] : m_applets) { - applet->message_queue.OperationModeChanged(); - } -} - -void AppletManager::FocusStateChanged() { - std::scoped_lock lk{m_lock}; - - for (const auto& [aruid, applet] : m_applets) { - applet->message_queue.FocusStateChanged(); - } + applet->process->Run(); } } // namespace Service::AM diff --git a/src/core/hle/service/am/applet_manager.h b/src/core/hle/service/am/applet_manager.h index 4875de309..fbdc77140 100644 --- a/src/core/hle/service/am/applet_manager.h +++ b/src/core/hle/service/am/applet_manager.h @@ -3,17 +3,23 @@ #pragma once -#include +#include #include -#include "core/hle/service/am/applet.h" +#include "core/hle/service/am/am_types.h" namespace Core { class System; } +namespace Service { +class Process; +} + namespace Service::AM { +class WindowSystem; + enum class LaunchType { FrontendInitiated, ApplicationInitiated, @@ -33,27 +39,24 @@ public: explicit AppletManager(Core::System& system); ~AppletManager(); - void InsertApplet(std::shared_ptr applet); - void TerminateAndRemoveApplet(AppletResourceUserId aruid); - - void CreateAndInsertByFrontendAppletParameters(AppletResourceUserId aruid, + void CreateAndInsertByFrontendAppletParameters(std::unique_ptr process, const FrontendAppletParameters& params); - std::shared_ptr GetByAppletResourceUserId(AppletResourceUserId aruid) const; - - void Reset(); - void RequestExit(); - void RequestResume(); void OperationModeChanged(); - void FocusStateChanged(); + +public: + void SetWindowSystem(WindowSystem* window_system); private: Core::System& m_system; - mutable std::mutex m_lock{}; - std::map> m_applets{}; + std::mutex m_lock; + std::condition_variable m_cv; - // AudioController state goes here + WindowSystem* m_window_system{}; + + FrontendAppletParameters m_pending_parameters{}; + std::unique_ptr m_pending_process{}; }; } // namespace Service::AM diff --git a/src/core/hle/service/am/applet_message_queue.cpp b/src/core/hle/service/am/applet_message_queue.cpp deleted file mode 100644 index 83c3c5a55..000000000 --- a/src/core/hle/service/am/applet_message_queue.cpp +++ /dev/null @@ -1,73 +0,0 @@ -// SPDX-FileCopyrightText: Copyright 2024 yuzu Emulator Project -// SPDX-License-Identifier: GPL-2.0-or-later - -#include "core/hle/service/am/applet_message_queue.h" -#include "core/hle/service/ipc_helpers.h" - -namespace Service::AM { - -AppletMessageQueue::AppletMessageQueue(Core::System& system) - : service_context{system, "AppletMessageQueue"} { - on_new_message = service_context.CreateEvent("AMMessageQueue:OnMessageReceived"); - on_operation_mode_changed = service_context.CreateEvent("AMMessageQueue:OperationModeChanged"); -} - -AppletMessageQueue::~AppletMessageQueue() { - service_context.CloseEvent(on_new_message); - service_context.CloseEvent(on_operation_mode_changed); -} - -Kernel::KReadableEvent& AppletMessageQueue::GetMessageReceiveEvent() { - return on_new_message->GetReadableEvent(); -} - -Kernel::KReadableEvent& AppletMessageQueue::GetOperationModeChangedEvent() { - return on_operation_mode_changed->GetReadableEvent(); -} - -void AppletMessageQueue::PushMessage(AppletMessage msg) { - { - std::scoped_lock lk{lock}; - messages.push(msg); - } - on_new_message->Signal(); -} - -AppletMessage AppletMessageQueue::PopMessage() { - std::scoped_lock lk{lock}; - if (messages.empty()) { - on_new_message->Clear(); - return AppletMessage::None; - } - auto msg = messages.front(); - messages.pop(); - if (messages.empty()) { - on_new_message->Clear(); - } - return msg; -} - -std::size_t AppletMessageQueue::GetMessageCount() const { - std::scoped_lock lk{lock}; - return messages.size(); -} - -void AppletMessageQueue::RequestExit() { - PushMessage(AppletMessage::Exit); -} - -void AppletMessageQueue::RequestResume() { - PushMessage(AppletMessage::Resume); -} - -void AppletMessageQueue::FocusStateChanged() { - PushMessage(AppletMessage::FocusStateChanged); -} - -void AppletMessageQueue::OperationModeChanged() { - PushMessage(AppletMessage::OperationModeChanged); - PushMessage(AppletMessage::PerformanceModeChanged); - on_operation_mode_changed->Signal(); -} - -} // namespace Service::AM diff --git a/src/core/hle/service/am/applet_message_queue.h b/src/core/hle/service/am/applet_message_queue.h deleted file mode 100644 index 429b77d37..000000000 --- a/src/core/hle/service/am/applet_message_queue.h +++ /dev/null @@ -1,43 +0,0 @@ -// SPDX-FileCopyrightText: Copyright 2024 yuzu Emulator Project -// SPDX-License-Identifier: GPL-2.0-or-later - -#pragma once - -#include - -#include "core/hle/service/am/am_types.h" -#include "core/hle/service/kernel_helpers.h" -#include "core/hle/service/service.h" - -namespace Kernel { -class KReadableEvent; -} // namespace Kernel - -namespace Service::AM { - -class AppletMessageQueue { -public: - explicit AppletMessageQueue(Core::System& system); - ~AppletMessageQueue(); - - Kernel::KReadableEvent& GetMessageReceiveEvent(); - Kernel::KReadableEvent& GetOperationModeChangedEvent(); - void PushMessage(AppletMessage msg); - AppletMessage PopMessage(); - std::size_t GetMessageCount() const; - void RequestExit(); - void RequestResume(); - void FocusStateChanged(); - void OperationModeChanged(); - -private: - KernelHelpers::ServiceContext service_context; - - Kernel::KEvent* on_new_message; - Kernel::KEvent* on_operation_mode_changed; - - mutable std::mutex lock; - std::queue messages; -}; - -} // namespace Service::AM diff --git a/src/core/hle/service/am/button_poller.cpp b/src/core/hle/service/am/button_poller.cpp new file mode 100644 index 000000000..aab397085 --- /dev/null +++ b/src/core/hle/service/am/button_poller.cpp @@ -0,0 +1,89 @@ +// SPDX-FileCopyrightText: Copyright 2024 yuzu Emulator Project +// SPDX-License-Identifier: GPL-2.0-or-later + +#include "core/core.h" +#include "core/hle/service/am/button_poller.h" +#include "core/hle/service/am/window_system.h" +#include "hid_core/frontend/emulated_controller.h" +#include "hid_core/hid_core.h" +#include "hid_core/hid_types.h" + +namespace Service::AM { + +namespace { + +ButtonPressDuration ClassifyPressDuration(std::chrono::steady_clock::time_point start) { + using namespace std::chrono_literals; + + const auto dur = std::chrono::steady_clock::now() - start; + + // TODO: determine actual thresholds + // TODO: these are likely different for each button + if (dur < 500ms) { + return ButtonPressDuration::ShortPressing; + } else if (dur < 1000ms) { + return ButtonPressDuration::MiddlePressing; + } else { + return ButtonPressDuration::LongPressing; + } +} + +} // namespace + +ButtonPoller::ButtonPoller(Core::System& system, WindowSystem& window_system) + : m_window_system(window_system) { + // TODO: am reads this from the home button state in hid, which is controller-agnostic. + Core::HID::ControllerUpdateCallback engine_callback{ + .on_change = + [this](Core::HID::ControllerTriggerType type) { + if (type == Core::HID::ControllerTriggerType::Button) { + this->OnButtonStateChanged(); + } + }, + .is_npad_service = true, + }; + + m_handheld = system.HIDCore().GetEmulatedController(Core::HID::NpadIdType::Handheld); + m_handheld_key = m_handheld->SetCallback(engine_callback); + m_player1 = system.HIDCore().GetEmulatedController(Core::HID::NpadIdType::Player1); + m_player1_key = m_player1->SetCallback(engine_callback); +} + +ButtonPoller::~ButtonPoller() { + m_handheld->DeleteCallback(m_handheld_key); + m_player1->DeleteCallback(m_player1_key); +} + +void ButtonPoller::OnButtonStateChanged() { + const bool home_button = + m_handheld->GetHomeButtons().home.Value() || m_player1->GetHomeButtons().home.Value(); + const bool capture_button = m_handheld->GetCaptureButtons().capture.Value() || + m_player1->GetCaptureButtons().capture.Value(); + + // Buttons pressed which were not previously pressed + if (home_button && !m_home_button_press_start) { + m_home_button_press_start = std::chrono::steady_clock::now(); + } + if (capture_button && !m_capture_button_press_start) { + m_capture_button_press_start = std::chrono::steady_clock::now(); + } + // if (power_button && !m_power_button_press_start) { + // m_power_button_press_start = std::chrono::steady_clock::now(); + // } + + // Buttons released which were previously held + if (!home_button && m_home_button_press_start) { + m_window_system.OnHomeButtonPressed(ClassifyPressDuration(*m_home_button_press_start)); + m_home_button_press_start = std::nullopt; + } + if (!capture_button && m_capture_button_press_start) { + // TODO + m_capture_button_press_start = std::nullopt; + } + // if (!power_button && m_power_button_press_start) { + // // TODO + // m_power_button_press_start = std::nullopt; + // } +} + +} // namespace Service::AM diff --git a/src/core/hle/service/am/button_poller.h b/src/core/hle/service/am/button_poller.h new file mode 100644 index 000000000..b1c39aad3 --- /dev/null +++ b/src/core/hle/service/am/button_poller.h @@ -0,0 +1,43 @@ +// SPDX-FileCopyrightText: Copyright 2024 yuzu Emulator Project +// SPDX-License-Identifier: GPL-2.0-or-later + +#pragma once + +#include +#include +#include "hid_core/frontend/emulated_controller.h" + +namespace Core { +namespace HID { +class EmulatedController; +} + +class System; +} // namespace Core + +namespace Service::AM { + +class WindowSystem; + +class ButtonPoller { +public: + explicit ButtonPoller(Core::System& system, WindowSystem& window_system); + ~ButtonPoller(); + +private: + void OnButtonStateChanged(); + +private: + WindowSystem& m_window_system; + + Core::HID::EmulatedController* m_handheld{}; + int m_handheld_key{}; + Core::HID::EmulatedController* m_player1{}; + int m_player1_key{}; + + std::optional m_home_button_press_start{}; + std::optional m_capture_button_press_start{}; + std::optional m_power_button_press_start{}; +}; + +} // namespace Service::AM diff --git a/src/core/hle/service/am/event_observer.cpp b/src/core/hle/service/am/event_observer.cpp new file mode 100644 index 000000000..5d1d303ed --- /dev/null +++ b/src/core/hle/service/am/event_observer.cpp @@ -0,0 +1,162 @@ +// SPDX-FileCopyrightText: Copyright 2024 yuzu Emulator Project +// SPDX-License-Identifier: GPL-2.0-or-later + +#include "core/core.h" +#include "core/hle/kernel/k_event.h" +#include "core/hle/service/am/applet.h" +#include "core/hle/service/am/event_observer.h" +#include "core/hle/service/am/window_system.h" + +namespace Service::AM { + +enum class UserDataTag : u32 { + WakeupEvent, + AppletProcess, +}; + +EventObserver::EventObserver(Core::System& system, WindowSystem& window_system) + : m_system(system), m_context(system, "am:EventObserver"), m_window_system(window_system), + m_wakeup_event(m_context), m_wakeup_holder(m_wakeup_event.GetHandle()) { + m_window_system.SetEventObserver(this); + m_wakeup_holder.SetUserData(static_cast(UserDataTag::WakeupEvent)); + m_wakeup_holder.LinkToMultiWait(std::addressof(m_multi_wait)); + m_thread = std::thread([&] { this->ThreadFunc(); }); +} + +EventObserver::~EventObserver() { + // Signal thread and wait for processing to finish. + m_stop_source.request_stop(); + m_wakeup_event.Signal(); + m_thread.join(); + + // Free remaining owned sessions. + auto it = m_process_holder_list.begin(); + while (it != m_process_holder_list.end()) { + // Get the holder. + auto* const holder = std::addressof(*it); + + // Remove from the list. + it = m_process_holder_list.erase(it); + + // Free the holder. + delete holder; + } +} + +void EventObserver::TrackAppletProcess(Applet& applet) { + // Don't observe dummy processes. + if (!applet.process->IsInitialized()) { + return; + } + + // Allocate new holder. + auto* holder = new ProcessHolder(applet, *applet.process); + holder->SetUserData(static_cast(UserDataTag::AppletProcess)); + + // Insert into list. + { + std::scoped_lock lk{m_lock}; + m_process_holder_list.push_back(*holder); + holder->LinkToMultiWait(std::addressof(m_deferred_wait_list)); + } + + // Signal wakeup. + m_wakeup_event.Signal(); +} + +void EventObserver::RequestUpdate() { + m_wakeup_event.Signal(); +} + +void EventObserver::LinkDeferred() { + std::scoped_lock lk{m_lock}; + m_multi_wait.MoveAll(std::addressof(m_deferred_wait_list)); +} + +MultiWaitHolder* EventObserver::WaitSignaled() { + while (true) { + this->LinkDeferred(); + + // If we're done, return before we start waiting. + if (m_stop_source.stop_requested()) { + return nullptr; + } + + auto* selected = m_multi_wait.WaitAny(m_system.Kernel()); + if (selected != std::addressof(m_wakeup_holder)) { + // Unlink the process. + selected->UnlinkFromMultiWait(); + } + + return selected; + } +} + +void EventObserver::Process(MultiWaitHolder* holder) { + switch (static_cast(holder->GetUserData())) { + case UserDataTag::WakeupEvent: + this->OnWakeupEvent(holder); + break; + case UserDataTag::AppletProcess: + this->OnProcessEvent(static_cast(holder)); + break; + default: + UNREACHABLE(); + } +} + +void EventObserver::OnWakeupEvent(MultiWaitHolder* holder) { + m_wakeup_event.Clear(); + + // Perform recalculation. + m_window_system.Update(); +} + +void EventObserver::OnProcessEvent(ProcessHolder* holder) { + // Check process state. + auto& applet = holder->GetApplet(); + auto& process = holder->GetProcess(); + + { + std::scoped_lock lk{m_lock, applet.lock}; + if (process.IsTerminated()) { + // Destroy the holder. + this->DestroyAppletProcessHolderLocked(holder); + } else { + // Reset signaled state. + process.ResetSignal(); + + // Relink wakeup event. + holder->LinkToMultiWait(std::addressof(m_deferred_wait_list)); + } + + // Set running. + applet.is_process_running = process.IsRunning(); + } + + // Perform recalculation. + m_window_system.Update(); +} + +void EventObserver::DestroyAppletProcessHolderLocked(ProcessHolder* holder) { + // Remove from owned list. + m_process_holder_list.erase(m_process_holder_list.iterator_to(*holder)); + + // Destroy and free. + delete holder; +} + +void EventObserver::ThreadFunc() { + Common::SetCurrentThreadName("am:EventObserver"); + + while (true) { + auto* signaled_holder = this->WaitSignaled(); + if (!signaled_holder) { + break; + } + + this->Process(signaled_holder); + } +} + +} // namespace Service::AM diff --git a/src/core/hle/service/am/event_observer.h b/src/core/hle/service/am/event_observer.h new file mode 100644 index 000000000..3e52e8494 --- /dev/null +++ b/src/core/hle/service/am/event_observer.h @@ -0,0 +1,74 @@ +// SPDX-FileCopyrightText: Copyright 2024 yuzu Emulator Project +// SPDX-License-Identifier: GPL-2.0-or-later + +#pragma once + +#include "common/polyfill_thread.h" +#include "common/thread.h" +#include "core/hle/service/kernel_helpers.h" +#include "core/hle/service/os/event.h" +#include "core/hle/service/os/multi_wait.h" + +namespace Core { +class System; +} + +namespace Service::AM { + +struct Applet; +class ProcessHolder; +class WindowSystem; + +class EventObserver { +public: + explicit EventObserver(Core::System& system, WindowSystem& window_system); + ~EventObserver(); + + void TrackAppletProcess(Applet& applet); + void RequestUpdate(); + +private: + void LinkDeferred(); + MultiWaitHolder* WaitSignaled(); + void Process(MultiWaitHolder* holder); + bool WaitAndProcessImpl(); + void LoopProcess(); + +private: + void OnWakeupEvent(MultiWaitHolder* holder); + void OnProcessEvent(ProcessHolder* holder); + +private: + void DestroyAppletProcessHolderLocked(ProcessHolder* holder); + +private: + void ThreadFunc(); + +private: + // System reference and context. + Core::System& m_system; + KernelHelpers::ServiceContext m_context; + + // Window manager. + WindowSystem& m_window_system; + + // Guest event handle to wake up the event loop processor. + Event m_wakeup_event; + MultiWaitHolder m_wakeup_holder; + + // Mutex to protect remaining members. + std::mutex m_lock{}; + + // List of owned process holders. + Common::IntrusiveListBaseTraits::ListType m_process_holder_list; + + // Multi-wait objects for new tasks. + MultiWait m_multi_wait; + MultiWait m_deferred_wait_list; + + // Processing thread. + std::thread m_thread{}; + std::stop_source m_stop_source{}; +}; + +} // namespace Service::AM diff --git a/src/core/hle/service/am/frontend/applets.cpp b/src/core/hle/service/am/frontend/applets.cpp index e662c6cd6..cdd431857 100644 --- a/src/core/hle/service/am/frontend/applets.cpp +++ b/src/core/hle/service/am/frontend/applets.cpp @@ -69,7 +69,11 @@ void FrontendApplet::PushInteractiveOutData(std::shared_ptr storage) { } void FrontendApplet::Exit() { - applet.lock()->caller_applet_broker->SignalCompletion(); + auto applet_ = applet.lock(); + + std::scoped_lock lk{applet_->lock}; + applet_->is_completed = true; + applet_->state_changed_event.Signal(); } FrontendAppletSet::FrontendAppletSet() = default; diff --git a/src/core/hle/service/am/hid_registration.cpp b/src/core/hle/service/am/hid_registration.cpp index 8ed49bac1..ea4bd8f45 100644 --- a/src/core/hle/service/am/hid_registration.cpp +++ b/src/core/hle/service/am/hid_registration.cpp @@ -3,24 +3,28 @@ #include "core/core.h" #include "core/hle/service/am/hid_registration.h" -#include "core/hle/service/am/process.h" #include "core/hle/service/hid/hid_server.h" +#include "core/hle/service/os/process.h" #include "core/hle/service/sm/sm.h" #include "hid_core/resource_manager.h" namespace Service::AM { HidRegistration::HidRegistration(Core::System& system, Process& process) : m_process(process) { - m_hid_server = system.ServiceManager().GetService("hid"); + m_hid_server = system.ServiceManager().GetService("hid", true); if (m_process.IsInitialized()) { m_hid_server->GetResourceManager()->RegisterAppletResourceUserId(m_process.GetProcessId(), true); + m_hid_server->GetResourceManager()->SetAruidValidForVibration(m_process.GetProcessId(), + true); } } HidRegistration::~HidRegistration() { if (m_process.IsInitialized()) { + m_hid_server->GetResourceManager()->SetAruidValidForVibration(m_process.GetProcessId(), + false); m_hid_server->GetResourceManager()->UnregisterAppletResourceUserId( m_process.GetProcessId()); } @@ -28,6 +32,8 @@ HidRegistration::~HidRegistration() { void HidRegistration::EnableAppletToGetInput(bool enable) { if (m_process.IsInitialized()) { + m_hid_server->GetResourceManager()->SetAruidValidForVibration(m_process.GetProcessId(), + enable); m_hid_server->GetResourceManager()->EnableInput(m_process.GetProcessId(), enable); } } diff --git a/src/core/hle/service/am/hid_registration.h b/src/core/hle/service/am/hid_registration.h index 67cd84961..54f42af18 100644 --- a/src/core/hle/service/am/hid_registration.h +++ b/src/core/hle/service/am/hid_registration.h @@ -13,9 +13,11 @@ namespace Service::HID { class IHidServer; } -namespace Service::AM { - +namespace Service { class Process; +} + +namespace Service::AM { class HidRegistration { public: diff --git a/src/core/hle/service/am/lifecycle_manager.cpp b/src/core/hle/service/am/lifecycle_manager.cpp new file mode 100644 index 000000000..0dac27ed0 --- /dev/null +++ b/src/core/hle/service/am/lifecycle_manager.cpp @@ -0,0 +1,379 @@ +// SPDX-FileCopyrightText: Copyright 2024 yuzu Emulator Project +// SPDX-License-Identifier: GPL-2.0-or-later + +#include "common/assert.h" +#include "core/hle/service/am/lifecycle_manager.h" + +namespace Service::AM { + +LifecycleManager::LifecycleManager(Core::System& system, KernelHelpers::ServiceContext& context, + bool is_application) + : m_system_event(context), m_operation_mode_changed_system_event(context), + m_is_application(is_application) {} + +LifecycleManager::~LifecycleManager() = default; + +Event& LifecycleManager::GetSystemEvent() { + return m_system_event; +} + +Event& LifecycleManager::GetOperationModeChangedSystemEvent() { + return m_operation_mode_changed_system_event; +} + +void LifecycleManager::PushUnorderedMessage(AppletMessage message) { + m_unordered_messages.push_back(message); + this->SignalSystemEventIfNeeded(); +} + +AppletMessage LifecycleManager::PopMessageInOrderOfPriority() { + if (m_has_resume) { + m_has_resume = false; + return AppletMessage::Resume; + } + + if (m_has_acknowledged_exit != m_has_requested_exit) { + m_has_acknowledged_exit = m_has_requested_exit; + return AppletMessage::Exit; + } + + if (m_focus_state_changed_notification_enabled) { + if (!m_is_application) { + if (m_requested_focus_state != m_acknowledged_focus_state) { + m_acknowledged_focus_state = m_requested_focus_state; + switch (m_requested_focus_state) { + case FocusState::InFocus: + return AppletMessage::ChangeIntoForeground; + case FocusState::NotInFocus: + return AppletMessage::ChangeIntoBackground; + default: + ASSERT(false); + } + } + } else if (m_has_focus_state_changed) { + m_has_focus_state_changed = false; + return AppletMessage::FocusStateChanged; + } + } + + if (m_has_requested_request_to_prepare_sleep != m_has_acknowledged_request_to_prepare_sleep) { + m_has_acknowledged_request_to_prepare_sleep = true; + return AppletMessage::RequestToPrepareSleep; + } + + if (m_requested_request_to_display_state != m_acknowledged_request_to_display_state) { + m_acknowledged_request_to_display_state = m_requested_request_to_display_state; + return AppletMessage::RequestToDisplay; + } + + if (m_has_operation_mode_changed) { + m_has_operation_mode_changed = false; + return AppletMessage::OperationModeChanged; + } + + if (m_has_performance_mode_changed) { + m_has_performance_mode_changed = false; + return AppletMessage::PerformanceModeChanged; + } + + if (m_has_sd_card_removed) { + m_has_sd_card_removed = false; + return AppletMessage::SdCardRemoved; + } + + if (m_has_sleep_required_by_high_temperature) { + m_has_sleep_required_by_high_temperature = false; + return AppletMessage::SleepRequiredByHighTemperature; + } + + if (m_has_sleep_required_by_low_battery) { + m_has_sleep_required_by_low_battery = false; + return AppletMessage::SleepRequiredByLowBattery; + } + + if (m_has_auto_power_down) { + m_has_auto_power_down = false; + return AppletMessage::AutoPowerDown; + } + + if (m_has_album_screen_shot_taken) { + m_has_album_screen_shot_taken = false; + return AppletMessage::AlbumScreenShotTaken; + } + + if (m_has_album_recording_saved) { + m_has_album_recording_saved = false; + return AppletMessage::AlbumRecordingSaved; + } + + if (!m_unordered_messages.empty()) { + const auto message = m_unordered_messages.front(); + m_unordered_messages.pop_front(); + return message; + } + + return AppletMessage::None; +} + +bool LifecycleManager::ShouldSignalSystemEvent() { + if (m_focus_state_changed_notification_enabled) { + if (!m_is_application) { + if (m_requested_focus_state != m_acknowledged_focus_state) { + return true; + } + } else if (m_has_focus_state_changed) { + return true; + } + } + + return !m_unordered_messages.empty() || m_has_resume || + (m_has_requested_exit != m_has_acknowledged_exit) || + (m_has_requested_request_to_prepare_sleep != + m_has_acknowledged_request_to_prepare_sleep) || + m_has_operation_mode_changed || m_has_performance_mode_changed || + m_has_sd_card_removed || m_has_sleep_required_by_high_temperature || + m_has_sleep_required_by_low_battery || m_has_auto_power_down || + (m_requested_request_to_display_state != m_acknowledged_request_to_display_state) || + m_has_album_screen_shot_taken || m_has_album_recording_saved; +} + +void LifecycleManager::OnOperationAndPerformanceModeChanged() { + if (m_operation_mode_changed_notification_enabled) { + m_has_operation_mode_changed = true; + } + if (m_performance_mode_changed_notification_enabled) { + m_has_performance_mode_changed = true; + } + m_operation_mode_changed_system_event.Signal(); + this->SignalSystemEventIfNeeded(); +} + +void LifecycleManager::SignalSystemEventIfNeeded() { + // Check our cached value for the system event. + const bool applet_message_available = m_applet_message_available; + + // If it's not current, we need to do an update, either clearing or signaling. + if (applet_message_available != this->ShouldSignalSystemEvent()) { + if (!applet_message_available) { + m_system_event.Signal(); + m_applet_message_available = true; + } else { + m_system_event.Clear(); + m_applet_message_available = false; + } + } +} + +bool LifecycleManager::PopMessage(AppletMessage* out_message) { + const auto message = this->PopMessageInOrderOfPriority(); + this->SignalSystemEventIfNeeded(); + + *out_message = message; + return message != AppletMessage::None; +} + +void LifecycleManager::SetFocusHandlingMode(bool suspend) { + switch (m_focus_handling_mode) { + case FocusHandlingMode::AlwaysSuspend: + case FocusHandlingMode::SuspendHomeSleep: + if (!suspend) { + // Disallow suspension. + m_focus_handling_mode = FocusHandlingMode::NoSuspend; + } + break; + case FocusHandlingMode::NoSuspend: + if (suspend) { + // Allow suspension temporally. + m_focus_handling_mode = FocusHandlingMode::SuspendHomeSleep; + } + break; + } +} + +void LifecycleManager::SetOutOfFocusSuspendingEnabled(bool enabled) { + switch (m_focus_handling_mode) { + case FocusHandlingMode::AlwaysSuspend: + if (!enabled) { + // Allow suspension temporally. + m_focus_handling_mode = FocusHandlingMode::SuspendHomeSleep; + } + break; + case FocusHandlingMode::SuspendHomeSleep: + case FocusHandlingMode::NoSuspend: + if (enabled) { + // Allow suspension. + m_focus_handling_mode = FocusHandlingMode::AlwaysSuspend; + } + break; + } +} + +void LifecycleManager::RemoveForceResumeIfPossible() { + // If resume is not forced, we have nothing to do. + if (m_suspend_mode != SuspendMode::ForceResume) { + return; + } + + // Check activity state. + // If we are already resumed, we can remove the forced state. + switch (m_activity_state) { + case ActivityState::ForegroundVisible: + case ActivityState::ForegroundObscured: + m_suspend_mode = SuspendMode::NoOverride; + return; + + default: + break; + } + + // Check focus handling mode. + switch (m_focus_handling_mode) { + case FocusHandlingMode::AlwaysSuspend: + case FocusHandlingMode::SuspendHomeSleep: + // If the applet allows suspension, we can remove the forced state. + m_suspend_mode = SuspendMode::NoOverride; + break; + + case FocusHandlingMode::NoSuspend: + // If the applet is not an application, we can remove the forced state. + // Only applications can be forced to resume. + if (!m_is_application) { + m_suspend_mode = SuspendMode::NoOverride; + } + } +} + +bool LifecycleManager::IsRunnable() const { + // If suspend is forced, return that. + if (m_forced_suspend) { + return false; + } + + // Check suspend mode override. + switch (m_suspend_mode) { + case SuspendMode::NoOverride: + // Continue processing. + break; + + case SuspendMode::ForceResume: + // The applet is runnable during forced resumption when its exit is requested. + return m_has_requested_exit; + + case SuspendMode::ForceSuspend: + // The applet is never runnable during forced suspension. + return false; + } + + // Always run if exit is requested. + if (m_has_requested_exit) { + return true; + } + + if (m_activity_state == ActivityState::ForegroundVisible) { + // The applet is runnable now. + return true; + } + + if (m_activity_state == ActivityState::ForegroundObscured) { + switch (m_focus_handling_mode) { + case FocusHandlingMode::AlwaysSuspend: + // The applet is not runnable while running the applet. + return false; + + case FocusHandlingMode::SuspendHomeSleep: + // The applet is runnable while running the applet. + return true; + + case FocusHandlingMode::NoSuspend: + // The applet is always runnable. + return true; + } + } + + // The activity is a suspended one. + // The applet should be suspended unless it has disabled suspension. + return m_focus_handling_mode == FocusHandlingMode::NoSuspend; +} + +FocusState LifecycleManager::GetFocusStateWhileForegroundObscured() const { + switch (m_focus_handling_mode) { + case FocusHandlingMode::AlwaysSuspend: + // The applet never learns it has lost focus. + return FocusState::InFocus; + + case FocusHandlingMode::SuspendHomeSleep: + // The applet learns it has lost focus when launching a child applet. + return FocusState::NotInFocus; + + case FocusHandlingMode::NoSuspend: + // The applet always learns it has lost focus. + return FocusState::NotInFocus; + + default: + UNREACHABLE(); + } +} + +FocusState LifecycleManager::GetFocusStateWhileBackground(bool is_obscured) const { + switch (m_focus_handling_mode) { + case FocusHandlingMode::AlwaysSuspend: + // The applet never learns it has lost focus. + return FocusState::InFocus; + + case FocusHandlingMode::SuspendHomeSleep: + // The applet learns it has lost focus when launching a child applet. + return is_obscured ? FocusState::NotInFocus : FocusState::InFocus; + + case FocusHandlingMode::NoSuspend: + // The applet always learns it has lost focus. + return m_is_application ? FocusState::Background : FocusState::NotInFocus; + + default: + UNREACHABLE(); + } +} + +bool LifecycleManager::UpdateRequestedFocusState() { + FocusState new_state{}; + + if (m_suspend_mode == SuspendMode::NoOverride) { + // With no forced suspend or resume, we take the focus state designated + // by the combination of the activity flag and the focus handling mode. + switch (m_activity_state) { + case ActivityState::ForegroundVisible: + new_state = FocusState::InFocus; + break; + + case ActivityState::ForegroundObscured: + new_state = this->GetFocusStateWhileForegroundObscured(); + break; + + case ActivityState::BackgroundVisible: + new_state = this->GetFocusStateWhileBackground(false); + break; + + case ActivityState::BackgroundObscured: + new_state = this->GetFocusStateWhileBackground(true); + break; + + default: + UNREACHABLE(); + } + } else { + // With forced suspend or resume, the applet is guaranteed to be background. + new_state = this->GetFocusStateWhileBackground(false); + } + + if (new_state != m_requested_focus_state) { + // Mark the focus state as ready for update. + m_requested_focus_state = new_state; + + // We changed the focus state. + return true; + } + + // We didn't change the focus state. + return false; +} + +} // namespace Service::AM diff --git a/src/core/hle/service/am/lifecycle_manager.h b/src/core/hle/service/am/lifecycle_manager.h new file mode 100644 index 000000000..7c70434a1 --- /dev/null +++ b/src/core/hle/service/am/lifecycle_manager.h @@ -0,0 +1,183 @@ +// SPDX-FileCopyrightText: Copyright 2024 yuzu Emulator Project +// SPDX-License-Identifier: GPL-2.0-or-later + +#pragma once + +#include + +#include "core/hle/service/am/am_types.h" +#include "core/hle/service/os/event.h" + +namespace Core { +class System; +} + +namespace Service::AM { + +enum class ActivityState : u32 { + ForegroundVisible = 0, + ForegroundObscured = 1, + BackgroundVisible = 2, + BackgroundObscured = 3, +}; + +enum class FocusHandlingMode : u32 { + AlwaysSuspend = 0, + SuspendHomeSleep = 1, + NoSuspend = 2, +}; + +enum class SuspendMode : u32 { + NoOverride = 0, + ForceResume = 1, + ForceSuspend = 2, +}; + +class LifecycleManager { +public: + explicit LifecycleManager(Core::System& system, KernelHelpers::ServiceContext& context, + bool is_application); + ~LifecycleManager(); + +public: + Event& GetSystemEvent(); + Event& GetOperationModeChangedSystemEvent(); + +public: + bool IsApplication() { + return m_is_application; + } + + bool GetForcedSuspend() { + return m_forced_suspend; + } + + bool GetExitRequested() { + return m_has_requested_exit; + } + + ActivityState GetActivityState() { + return m_activity_state; + } + + FocusState GetAndClearFocusState() { + m_acknowledged_focus_state = m_requested_focus_state; + return m_acknowledged_focus_state; + } + + void SetFocusState(FocusState state) { + if (m_requested_focus_state != state) { + m_has_focus_state_changed = true; + } + m_requested_focus_state = state; + this->SignalSystemEventIfNeeded(); + } + + void RequestExit() { + m_has_requested_exit = true; + this->SignalSystemEventIfNeeded(); + } + + void RequestResumeNotification() { + // NOTE: this appears to be a bug in am. + // If an applet makes a concurrent request to receive resume notifications + // while it is being suspended, the first resume notification will be lost. + // This is not the case with other notification types. + if (m_resume_notification_enabled) { + m_has_resume = true; + } + } + + void OnOperationAndPerformanceModeChanged(); + +public: + void SetFocusStateChangedNotificationEnabled(bool enabled) { + m_focus_state_changed_notification_enabled = enabled; + this->SignalSystemEventIfNeeded(); + } + + void SetOperationModeChangedNotificationEnabled(bool enabled) { + m_operation_mode_changed_notification_enabled = enabled; + this->SignalSystemEventIfNeeded(); + } + + void SetPerformanceModeChangedNotificationEnabled(bool enabled) { + m_performance_mode_changed_notification_enabled = enabled; + this->SignalSystemEventIfNeeded(); + } + + void SetResumeNotificationEnabled(bool enabled) { + m_resume_notification_enabled = enabled; + } + + void SetActivityState(ActivityState state) { + m_activity_state = state; + } + + void SetSuspendMode(SuspendMode mode) { + m_suspend_mode = mode; + } + + void SetForcedSuspend(bool enabled) { + m_forced_suspend = enabled; + } + +public: + void SetFocusHandlingMode(bool suspend); + void SetOutOfFocusSuspendingEnabled(bool enabled); + void RemoveForceResumeIfPossible(); + bool IsRunnable() const; + bool UpdateRequestedFocusState(); + void SignalSystemEventIfNeeded(); + +public: + void PushUnorderedMessage(AppletMessage message); + bool PopMessage(AppletMessage* out_message); + +private: + FocusState GetFocusStateWhileForegroundObscured() const; + FocusState GetFocusStateWhileBackground(bool is_obscured) const; + +private: + AppletMessage PopMessageInOrderOfPriority(); + bool ShouldSignalSystemEvent(); + +private: + Event m_system_event; + Event m_operation_mode_changed_system_event; + + std::list m_unordered_messages{}; + + bool m_is_application{}; + bool m_focus_state_changed_notification_enabled{true}; + bool m_operation_mode_changed_notification_enabled{true}; + bool m_performance_mode_changed_notification_enabled{true}; + bool m_resume_notification_enabled{}; + + bool m_requested_request_to_display_state{}; + bool m_acknowledged_request_to_display_state{}; + bool m_has_resume{}; + bool m_has_focus_state_changed{true}; + bool m_has_album_recording_saved{}; + bool m_has_album_screen_shot_taken{}; + bool m_has_auto_power_down{}; + bool m_has_sleep_required_by_low_battery{}; + bool m_has_sleep_required_by_high_temperature{}; + bool m_has_sd_card_removed{}; + bool m_has_performance_mode_changed{}; + bool m_has_operation_mode_changed{}; + bool m_has_requested_request_to_prepare_sleep{}; + bool m_has_acknowledged_request_to_prepare_sleep{}; + bool m_has_requested_exit{}; + bool m_has_acknowledged_exit{}; + bool m_applet_message_available{}; + + bool m_forced_suspend{}; + FocusHandlingMode m_focus_handling_mode{FocusHandlingMode::SuspendHomeSleep}; + ActivityState m_activity_state{ActivityState::ForegroundVisible}; + SuspendMode m_suspend_mode{SuspendMode::NoOverride}; + FocusState m_requested_focus_state{}; + FocusState m_acknowledged_focus_state{}; +}; + +} // namespace Service::AM diff --git a/src/core/hle/service/am/process_creation.cpp b/src/core/hle/service/am/process_creation.cpp new file mode 100644 index 000000000..237151d06 --- /dev/null +++ b/src/core/hle/service/am/process_creation.cpp @@ -0,0 +1,130 @@ +// SPDX-FileCopyrightText: Copyright 2024 yuzu Emulator Project +// SPDX-License-Identifier: GPL-2.0-or-later + +#include "core/core.h" +#include "core/file_sys/content_archive.h" +#include "core/file_sys/nca_metadata.h" +#include "core/file_sys/patch_manager.h" +#include "core/file_sys/registered_cache.h" +#include "core/file_sys/romfs_factory.h" +#include "core/hle/service/am/process_creation.h" +#include "core/hle/service/glue/glue_manager.h" +#include "core/hle/service/os/process.h" +#include "core/loader/loader.h" + +namespace Service::AM { + +namespace { + +FileSys::StorageId GetStorageIdForFrontendSlot( + std::optional slot) { + if (!slot.has_value()) { + return FileSys::StorageId::None; + } + + switch (*slot) { + case FileSys::ContentProviderUnionSlot::UserNAND: + return FileSys::StorageId::NandUser; + case FileSys::ContentProviderUnionSlot::SysNAND: + return FileSys::StorageId::NandSystem; + case FileSys::ContentProviderUnionSlot::SDMC: + return FileSys::StorageId::SdCard; + case FileSys::ContentProviderUnionSlot::FrontendManual: + return FileSys::StorageId::Host; + default: + return FileSys::StorageId::None; + } +} + +std::unique_ptr CreateProcessImpl(std::unique_ptr& out_loader, + Loader::ResultStatus& out_load_result, + Core::System& system, FileSys::VirtualFile file, + u64 program_id, u64 program_index) { + // Get the appropriate loader to parse this NCA. + out_loader = Loader::GetLoader(system, file, program_id, program_index); + + // Ensure we have a loader which can parse the NCA. + if (!out_loader) { + return nullptr; + } + + // Try to load the process. + auto process = std::make_unique(system); + if (process->Initialize(*out_loader, out_load_result)) { + return process; + } + + return nullptr; +} + +} // Anonymous namespace + +std::unique_ptr CreateProcess(Core::System& system, u64 program_id, + u8 minimum_key_generation, u8 maximum_key_generation) { + // Attempt to load program NCA. + FileSys::VirtualFile nca_raw{}; + + // Get the program NCA from storage. + auto& storage = system.GetContentProviderUnion(); + nca_raw = storage.GetEntryRaw(program_id, FileSys::ContentRecordType::Program); + + // Ensure we retrieved a program NCA. + if (!nca_raw) { + return nullptr; + } + + // Ensure we have a suitable version. + if (minimum_key_generation > 0) { + FileSys::NCA nca(nca_raw); + if (nca.GetStatus() == Loader::ResultStatus::Success && + (nca.GetKeyGeneration() < minimum_key_generation || + nca.GetKeyGeneration() > maximum_key_generation)) { + LOG_WARNING(Service_LDR, "Skipping program {:016X} with generation {}", program_id, + nca.GetKeyGeneration()); + return nullptr; + } + } + + std::unique_ptr loader; + Loader::ResultStatus status; + return CreateProcessImpl(loader, status, system, nca_raw, program_id, 0); +} + +std::unique_ptr CreateApplicationProcess(std::vector& out_control, + std::unique_ptr& out_loader, + Loader::ResultStatus& out_load_result, + Core::System& system, FileSys::VirtualFile file, + u64 program_id, u64 program_index) { + auto process = + CreateProcessImpl(out_loader, out_load_result, system, file, program_id, program_index); + if (!process) { + return nullptr; + } + + FileSys::NACP nacp; + if (out_loader->ReadControlData(nacp) == Loader::ResultStatus::Success) { + out_control = nacp.GetRawBytes(); + } else { + out_control.resize(sizeof(FileSys::RawNACP)); + } + + auto& storage = system.GetContentProviderUnion(); + Service::Glue::ApplicationLaunchProperty launch{}; + launch.title_id = process->GetProgramId(); + + FileSys::PatchManager pm{launch.title_id, system.GetFileSystemController(), storage}; + launch.version = pm.GetGameVersion().value_or(0); + + // TODO(DarkLordZach): When FSController/Game Card Support is added, if + // current_process_game_card use correct StorageId + launch.base_game_storage_id = GetStorageIdForFrontendSlot( + storage.GetSlotForEntry(launch.title_id, FileSys::ContentRecordType::Program)); + launch.update_storage_id = GetStorageIdForFrontendSlot(storage.GetSlotForEntry( + FileSys::GetUpdateTitleID(launch.title_id), FileSys::ContentRecordType::Program)); + + system.GetARPManager().Register(launch.title_id, launch, out_control); + + return process; +} + +} // namespace Service::AM diff --git a/src/core/hle/service/am/process_creation.h b/src/core/hle/service/am/process_creation.h new file mode 100644 index 000000000..8cfb9e0c9 --- /dev/null +++ b/src/core/hle/service/am/process_creation.h @@ -0,0 +1,35 @@ +// SPDX-FileCopyrightText: Copyright 2024 yuzu Emulator Project +// SPDX-License-Identifier: GPL-2.0-or-later + +#pragma once + +#include +#include + +#include "common/common_types.h" +#include "core/file_sys/vfs/vfs_types.h" + +namespace Core { +class System; +} + +namespace Loader { +class AppLoader; +enum class ResultStatus : u16; +} // namespace Loader + +namespace Service { +class Process; +} + +namespace Service::AM { + +std::unique_ptr CreateProcess(Core::System& system, u64 program_id, + u8 minimum_key_generation, u8 maximum_key_generation); +std::unique_ptr CreateApplicationProcess(std::vector& out_control, + std::unique_ptr& out_loader, + Loader::ResultStatus& out_load_result, + Core::System& system, FileSys::VirtualFile file, + u64 program_id, u64 program_index); + +} // namespace Service::AM diff --git a/src/core/hle/service/am/process_holder.cpp b/src/core/hle/service/am/process_holder.cpp new file mode 100644 index 000000000..21ef5bf83 --- /dev/null +++ b/src/core/hle/service/am/process_holder.cpp @@ -0,0 +1,15 @@ +// SPDX-FileCopyrightText: Copyright 2024 yuzu Emulator Project +// SPDX-License-Identifier: GPL-2.0-or-later + +#include "core/hle/kernel/k_process.h" +#include "core/hle/service/am/process_holder.h" +#include "core/hle/service/os/process.h" + +namespace Service::AM { + +ProcessHolder::ProcessHolder(Applet& applet, Process& process) + : MultiWaitHolder(process.GetHandle()), m_applet(applet), m_process(process) {} + +ProcessHolder::~ProcessHolder() = default; + +} // namespace Service::AM diff --git a/src/core/hle/service/am/process_holder.h b/src/core/hle/service/am/process_holder.h new file mode 100644 index 000000000..3a9b81dfb --- /dev/null +++ b/src/core/hle/service/am/process_holder.h @@ -0,0 +1,34 @@ +// SPDX-FileCopyrightText: Copyright 2024 yuzu Emulator Project +// SPDX-License-Identifier: GPL-2.0-or-later + +#pragma once + +#include "core/hle/service/os/multi_wait_holder.h" + +namespace Service { +class Process; +} + +namespace Service::AM { + +struct Applet; + +class ProcessHolder : public MultiWaitHolder, public Common::IntrusiveListBaseNode { +public: + explicit ProcessHolder(Applet& applet, Process& process); + ~ProcessHolder(); + + Applet& GetApplet() const { + return m_applet; + } + + Process& GetProcess() const { + return m_process; + } + +private: + Applet& m_applet; + Process& m_process; +}; + +} // namespace Service::AM diff --git a/src/core/hle/service/am/service/all_system_applet_proxies_service.cpp b/src/core/hle/service/am/service/all_system_applet_proxies_service.cpp index 21747783a..bc9c86c55 100644 --- a/src/core/hle/service/am/service/all_system_applet_proxies_service.cpp +++ b/src/core/hle/service/am/service/all_system_applet_proxies_service.cpp @@ -6,12 +6,14 @@ #include "core/hle/service/am/service/all_system_applet_proxies_service.h" #include "core/hle/service/am/service/library_applet_proxy.h" #include "core/hle/service/am/service/system_applet_proxy.h" +#include "core/hle/service/am/window_system.h" #include "core/hle/service/cmif_serialization.h" namespace Service::AM { -IAllSystemAppletProxiesService::IAllSystemAppletProxiesService(Core::System& system_) - : ServiceFramework{system_, "appletAE"} { +IAllSystemAppletProxiesService::IAllSystemAppletProxiesService(Core::System& system_, + WindowSystem& window_system) + : ServiceFramework{system_, "appletAE"}, m_window_system{window_system} { // clang-format off static const FunctionInfo functions[] = { {100, D<&IAllSystemAppletProxiesService::OpenSystemAppletProxy>, "OpenSystemAppletProxy"}, @@ -36,8 +38,8 @@ Result IAllSystemAppletProxiesService::OpenSystemAppletProxy( LOG_DEBUG(Service_AM, "called"); if (const auto applet = this->GetAppletFromProcessId(pid); applet) { - *out_system_applet_proxy = - std::make_shared(system, applet, process_handle.Get()); + *out_system_applet_proxy = std::make_shared( + system, applet, process_handle.Get(), m_window_system); R_SUCCEED(); } else { UNIMPLEMENTED(); @@ -52,8 +54,8 @@ Result IAllSystemAppletProxiesService::OpenLibraryAppletProxy( LOG_DEBUG(Service_AM, "called"); if (const auto applet = this->GetAppletFromProcessId(pid); applet) { - *out_library_applet_proxy = - std::make_shared(system, applet, process_handle.Get()); + *out_library_applet_proxy = std::make_shared( + system, applet, process_handle.Get(), m_window_system); R_SUCCEED(); } else { UNIMPLEMENTED(); @@ -73,7 +75,7 @@ Result IAllSystemAppletProxiesService::OpenLibraryAppletProxyOld( std::shared_ptr IAllSystemAppletProxiesService::GetAppletFromProcessId( ProcessId process_id) { - return system.GetAppletManager().GetByAppletResourceUserId(process_id.pid); + return m_window_system.GetByAppletResourceUserId(process_id.pid); } } // namespace Service::AM diff --git a/src/core/hle/service/am/service/all_system_applet_proxies_service.h b/src/core/hle/service/am/service/all_system_applet_proxies_service.h index 0e2dcb86d..e3e79dc4f 100644 --- a/src/core/hle/service/am/service/all_system_applet_proxies_service.h +++ b/src/core/hle/service/am/service/all_system_applet_proxies_service.h @@ -14,11 +14,12 @@ struct Applet; struct AppletAttribute; class ILibraryAppletProxy; class ISystemAppletProxy; +class WindowSystem; class IAllSystemAppletProxiesService final : public ServiceFramework { public: - explicit IAllSystemAppletProxiesService(Core::System& system_); + explicit IAllSystemAppletProxiesService(Core::System& system_, WindowSystem& window_system); ~IAllSystemAppletProxiesService() override; private: @@ -35,6 +36,8 @@ private: private: std::shared_ptr GetAppletFromProcessId(ProcessId pid); + + WindowSystem& m_window_system; }; } // namespace AM diff --git a/src/core/hle/service/am/service/applet_common_functions.cpp b/src/core/hle/service/am/service/applet_common_functions.cpp index 0f29ab285..a051000af 100644 --- a/src/core/hle/service/am/service/applet_common_functions.cpp +++ b/src/core/hle/service/am/service/applet_common_functions.cpp @@ -19,7 +19,7 @@ IAppletCommonFunctions::IAppletCommonFunctions(Core::System& system_, {21, nullptr, "TryPopFromAppletBoundChannel"}, {40, nullptr, "GetDisplayLogicalResolution"}, {42, nullptr, "SetDisplayMagnification"}, - {50, nullptr, "SetHomeButtonDoubleClickEnabled"}, + {50, D<&IAppletCommonFunctions::SetHomeButtonDoubleClickEnabled>, "SetHomeButtonDoubleClickEnabled"}, {51, D<&IAppletCommonFunctions::GetHomeButtonDoubleClickEnabled>, "GetHomeButtonDoubleClickEnabled"}, {52, nullptr, "IsHomeButtonShortPressedBlocked"}, {60, nullptr, "IsVrModeCurtainRequired"}, @@ -40,6 +40,13 @@ IAppletCommonFunctions::IAppletCommonFunctions(Core::System& system_, IAppletCommonFunctions::~IAppletCommonFunctions() = default; +Result IAppletCommonFunctions::SetHomeButtonDoubleClickEnabled( + bool home_button_double_click_enabled) { + LOG_WARNING(Service_AM, "(STUBBED) called, home_button_double_click_enabled={}", + home_button_double_click_enabled); + R_SUCCEED(); +} + Result IAppletCommonFunctions::GetHomeButtonDoubleClickEnabled( Out out_home_button_double_click_enabled) { LOG_WARNING(Service_AM, "(STUBBED) called"); diff --git a/src/core/hle/service/am/service/applet_common_functions.h b/src/core/hle/service/am/service/applet_common_functions.h index 4424fc83d..376f85acf 100644 --- a/src/core/hle/service/am/service/applet_common_functions.h +++ b/src/core/hle/service/am/service/applet_common_functions.h @@ -16,6 +16,7 @@ public: ~IAppletCommonFunctions() override; private: + Result SetHomeButtonDoubleClickEnabled(bool home_button_double_click_enabled); Result GetHomeButtonDoubleClickEnabled(Out out_home_button_double_click_enabled); Result SetCpuBoostRequestPriority(s32 priority); Result GetCurrentApplicationId(Out out_application_id); diff --git a/src/core/hle/service/am/service/application_accessor.cpp b/src/core/hle/service/am/service/application_accessor.cpp index 6e7d110e8..986abc716 100644 --- a/src/core/hle/service/am/service/application_accessor.cpp +++ b/src/core/hle/service/am/service/application_accessor.cpp @@ -9,12 +9,16 @@ #include "core/hle/service/am/service/application_accessor.h" #include "core/hle/service/am/service/library_applet_accessor.h" #include "core/hle/service/am/service/storage.h" +#include "core/hle/service/am/window_system.h" #include "core/hle/service/cmif_serialization.h" +#include "core/hle/service/glue/glue_manager.h" namespace Service::AM { -IApplicationAccessor::IApplicationAccessor(Core::System& system_, std::shared_ptr applet) - : ServiceFramework{system_, "IApplicationAccessor"}, m_applet(std::move(applet)) { +IApplicationAccessor::IApplicationAccessor(Core::System& system_, std::shared_ptr applet, + WindowSystem& window_system) + : ServiceFramework{system_, "IApplicationAccessor"}, m_window_system(window_system), + m_applet(std::move(applet)) { // clang-format off static const FunctionInfo functions[] = { {0, D<&IApplicationAccessor::GetAppletStateChangedEvent>, "GetAppletStateChangedEvent"}, @@ -59,7 +63,15 @@ Result IApplicationAccessor::Start() { Result IApplicationAccessor::RequestExit() { LOG_INFO(Service_AM, "called"); - m_applet->message_queue.RequestExit(); + + std::scoped_lock lk{m_applet->lock}; + if (m_applet->exit_locked) { + m_applet->lifecycle_manager.RequestExit(); + m_applet->UpdateSuspensionStateLocked(true); + } else { + m_applet->process->Terminate(); + } + R_SUCCEED(); } @@ -71,13 +83,14 @@ Result IApplicationAccessor::Terminate() { Result IApplicationAccessor::GetResult() { LOG_INFO(Service_AM, "called"); - R_SUCCEED(); + std::scoped_lock lk{m_applet->lock}; + R_RETURN(m_applet->terminate_result); } Result IApplicationAccessor::GetAppletStateChangedEvent( OutCopyHandle out_event) { LOG_INFO(Service_AM, "called"); - *out_event = m_applet->caller_applet_broker->GetStateChangedEvent().GetHandle(); + *out_event = m_applet->state_changed_event.GetHandle(); R_SUCCEED(); } @@ -96,8 +109,15 @@ Result IApplicationAccessor::PushLaunchParameter(LaunchParameterKind kind, Result IApplicationAccessor::GetApplicationControlProperty( OutBuffer out_control_property) { - LOG_WARNING(Service_AM, "(STUBBED) called"); - R_THROW(ResultUnknown); + LOG_INFO(Service_AM, "called"); + + std::vector nacp; + R_TRY(system.GetARPManager().GetControlProperty(&nacp, m_applet->program_id)); + + std::memcpy(out_control_property.data(), nacp.data(), + std::min(out_control_property.size(), nacp.size())); + + R_SUCCEED(); } Result IApplicationAccessor::SetUsers(bool enable, @@ -114,8 +134,9 @@ Result IApplicationAccessor::GetCurrentLibraryApplet( } Result IApplicationAccessor::RequestForApplicationToGetForeground() { - LOG_WARNING(Service_AM, "(STUBBED) called"); - R_THROW(ResultUnknown); + LOG_INFO(Service_AM, "called"); + m_window_system.RequestApplicationToGetForeground(); + R_SUCCEED(); } Result IApplicationAccessor::CheckRightsEnvironmentAvailable(Out out_is_available) { diff --git a/src/core/hle/service/am/service/application_accessor.h b/src/core/hle/service/am/service/application_accessor.h index 39a9b2153..b9797bcc0 100644 --- a/src/core/hle/service/am/service/application_accessor.h +++ b/src/core/hle/service/am/service/application_accessor.h @@ -13,10 +13,12 @@ namespace Service::AM { struct Applet; class ILibraryAppletAccessor; class IStorage; +class WindowSystem; class IApplicationAccessor final : public ServiceFramework { public: - explicit IApplicationAccessor(Core::System& system_, std::shared_ptr applet); + explicit IApplicationAccessor(Core::System& system_, std::shared_ptr applet, + WindowSystem& window_system); ~IApplicationAccessor() override; private: @@ -34,6 +36,7 @@ private: Result GetNsRightsEnvironmentHandle(Out out_handle); Result ReportApplicationExitTimeout(); + WindowSystem& m_window_system; const std::shared_ptr m_applet; }; diff --git a/src/core/hle/service/am/service/application_creator.cpp b/src/core/hle/service/am/service/application_creator.cpp index 568bb0122..8994f1914 100644 --- a/src/core/hle/service/am/service/application_creator.cpp +++ b/src/core/hle/service/am/service/application_creator.cpp @@ -1,17 +1,57 @@ // SPDX-FileCopyrightText: Copyright 2024 yuzu Emulator Project // SPDX-License-Identifier: GPL-2.0-or-later +#include "core/file_sys/nca_metadata.h" +#include "core/file_sys/registered_cache.h" #include "core/hle/service/am/am_types.h" #include "core/hle/service/am/applet.h" #include "core/hle/service/am/applet_manager.h" +#include "core/hle/service/am/process_creation.h" #include "core/hle/service/am/service/application_accessor.h" #include "core/hle/service/am/service/application_creator.h" +#include "core/hle/service/am/window_system.h" #include "core/hle/service/cmif_serialization.h" +#include "core/loader/loader.h" namespace Service::AM { -IApplicationCreator::IApplicationCreator(Core::System& system_) - : ServiceFramework{system_, "IApplicationCreator"} { +namespace { + +Result CreateGuestApplication(SharedPointer* out_application_accessor, + Core::System& system, WindowSystem& window_system, u64 program_id) { + FileSys::VirtualFile nca_raw{}; + + // Get the program NCA from storage. + auto& storage = system.GetContentProviderUnion(); + nca_raw = storage.GetEntryRaw(program_id, FileSys::ContentRecordType::Program); + + // Ensure we retrieved a program NCA. + R_UNLESS(nca_raw != nullptr, ResultUnknown); + + std::vector control; + std::unique_ptr loader; + Loader::ResultStatus result; + auto process = + CreateApplicationProcess(control, loader, result, system, nca_raw, program_id, 0); + R_UNLESS(process != nullptr, ResultUnknown); + + const auto applet = std::make_shared(system, std::move(process), true); + applet->program_id = program_id; + applet->applet_id = AppletId::Application; + applet->type = AppletType::Application; + applet->library_applet_mode = LibraryAppletMode::AllForeground; + + window_system.TrackApplet(applet, true); + + *out_application_accessor = + std::make_shared(system, applet, window_system); + R_SUCCEED(); +} + +} // namespace + +IApplicationCreator::IApplicationCreator(Core::System& system_, WindowSystem& window_system) + : ServiceFramework{system_, "IApplicationCreator"}, m_window_system{window_system} { // clang-format off static const FunctionInfo functions[] = { {0, D<&IApplicationCreator::CreateApplication>, "CreateApplication"}, @@ -28,8 +68,9 @@ IApplicationCreator::~IApplicationCreator() = default; Result IApplicationCreator::CreateApplication( Out> out_application_accessor, u64 application_id) { - LOG_ERROR(Service_NS, "called, application_id={:x}", application_id); - R_THROW(ResultUnknown); + LOG_INFO(Service_NS, "called, application_id={:016X}", application_id); + R_RETURN( + CreateGuestApplication(out_application_accessor, system, m_window_system, application_id)); } } // namespace Service::AM diff --git a/src/core/hle/service/am/service/application_creator.h b/src/core/hle/service/am/service/application_creator.h index 9f939ebf6..287745af8 100644 --- a/src/core/hle/service/am/service/application_creator.h +++ b/src/core/hle/service/am/service/application_creator.h @@ -10,14 +10,17 @@ namespace Service::AM { class IApplicationAccessor; struct Applet; +class WindowSystem; class IApplicationCreator final : public ServiceFramework { public: - explicit IApplicationCreator(Core::System& system_); + explicit IApplicationCreator(Core::System& system_, WindowSystem& window_system); ~IApplicationCreator() override; private: Result CreateApplication(Out>, u64 application_id); + + WindowSystem& m_window_system; }; } // namespace Service::AM diff --git a/src/core/hle/service/am/service/application_functions.cpp b/src/core/hle/service/am/service/application_functions.cpp index bfccb6b09..3bab5ac5f 100644 --- a/src/core/hle/service/am/service/application_functions.cpp +++ b/src/core/hle/service/am/service/application_functions.cpp @@ -181,7 +181,8 @@ Result IApplicationFunctions::GetDesiredLanguage(Out out_language_code) { } Result IApplicationFunctions::SetTerminateResult(Result terminate_result) { - LOG_INFO(Service_AM, "(STUBBED) called, result={:#x} ({}-{})", terminate_result.GetInnerValue(), + LOG_INFO(Service_AM, "(STUBBED) called, result={:#x} ({:04}-{:04})", + terminate_result.GetInnerValue(), static_cast(terminate_result.GetModule()) + 2000, terminate_result.GetDescription()); diff --git a/src/core/hle/service/am/service/application_proxy.cpp b/src/core/hle/service/am/service/application_proxy.cpp index 19d6a3b89..6e1328fee 100644 --- a/src/core/hle/service/am/service/application_proxy.cpp +++ b/src/core/hle/service/am/service/application_proxy.cpp @@ -17,9 +17,9 @@ namespace Service::AM { IApplicationProxy::IApplicationProxy(Core::System& system_, std::shared_ptr applet, - Kernel::KProcess* process) - : ServiceFramework{system_, "IApplicationProxy"}, m_process{process}, m_applet{ - std::move(applet)} { + Kernel::KProcess* process, WindowSystem& window_system) + : ServiceFramework{system_, "IApplicationProxy"}, + m_window_system{window_system}, m_process{process}, m_applet{std::move(applet)} { // clang-format off static const FunctionInfo functions[] = { {0, D<&IApplicationProxy::GetCommonStateGetter>, "GetCommonStateGetter"}, @@ -70,7 +70,7 @@ Result IApplicationProxy::GetDebugFunctions( Result IApplicationProxy::GetWindowController( Out> out_window_controller) { LOG_DEBUG(Service_AM, "called"); - *out_window_controller = std::make_shared(system, m_applet); + *out_window_controller = std::make_shared(system, m_applet, m_window_system); R_SUCCEED(); } @@ -91,7 +91,8 @@ Result IApplicationProxy::GetCommonStateGetter( Result IApplicationProxy::GetLibraryAppletCreator( Out> out_library_applet_creator) { LOG_DEBUG(Service_AM, "called"); - *out_library_applet_creator = std::make_shared(system, m_applet); + *out_library_applet_creator = + std::make_shared(system, m_applet, m_window_system); R_SUCCEED(); } diff --git a/src/core/hle/service/am/service/application_proxy.h b/src/core/hle/service/am/service/application_proxy.h index 6da350df7..8c62459c4 100644 --- a/src/core/hle/service/am/service/application_proxy.h +++ b/src/core/hle/service/am/service/application_proxy.h @@ -18,11 +18,12 @@ class ILibraryAppletCreator; class IProcessWindingController; class ISelfController; class IWindowController; +class WindowSystem; class IApplicationProxy final : public ServiceFramework { public: explicit IApplicationProxy(Core::System& system_, std::shared_ptr applet, - Kernel::KProcess* process); + Kernel::KProcess* process, WindowSystem& window_system); ~IApplicationProxy(); private: @@ -40,6 +41,7 @@ private: Out> out_application_functions); private: + WindowSystem& m_window_system; Kernel::KProcess* const m_process; const std::shared_ptr m_applet; }; diff --git a/src/core/hle/service/am/service/application_proxy_service.cpp b/src/core/hle/service/am/service/application_proxy_service.cpp index fd66e77b9..b7d7b3c2d 100644 --- a/src/core/hle/service/am/service/application_proxy_service.cpp +++ b/src/core/hle/service/am/service/application_proxy_service.cpp @@ -6,12 +6,14 @@ #include "core/hle/service/am/applet_manager.h" #include "core/hle/service/am/service/application_proxy.h" #include "core/hle/service/am/service/application_proxy_service.h" +#include "core/hle/service/am/window_system.h" #include "core/hle/service/cmif_serialization.h" namespace Service::AM { -IApplicationProxyService::IApplicationProxyService(Core::System& system_) - : ServiceFramework{system_, "appletOE"} { +IApplicationProxyService::IApplicationProxyService(Core::System& system_, + WindowSystem& window_system) + : ServiceFramework{system_, "appletOE"}, m_window_system{window_system} { static const FunctionInfo functions[] = { {0, D<&IApplicationProxyService::OpenApplicationProxy>, "OpenApplicationProxy"}, }; @@ -26,8 +28,8 @@ Result IApplicationProxyService::OpenApplicationProxy( LOG_DEBUG(Service_AM, "called"); if (const auto applet = this->GetAppletFromProcessId(pid)) { - *out_application_proxy = - std::make_shared(system, applet, process_handle.Get()); + *out_application_proxy = std::make_shared( + system, applet, process_handle.Get(), m_window_system); R_SUCCEED(); } else { UNIMPLEMENTED(); @@ -36,7 +38,7 @@ Result IApplicationProxyService::OpenApplicationProxy( } std::shared_ptr IApplicationProxyService::GetAppletFromProcessId(ProcessId process_id) { - return system.GetAppletManager().GetByAppletResourceUserId(process_id.pid); + return m_window_system.GetByAppletResourceUserId(process_id.pid); } } // namespace Service::AM diff --git a/src/core/hle/service/am/service/application_proxy_service.h b/src/core/hle/service/am/service/application_proxy_service.h index 8efafa31a..e5f4ea345 100644 --- a/src/core/hle/service/am/service/application_proxy_service.h +++ b/src/core/hle/service/am/service/application_proxy_service.h @@ -12,10 +12,11 @@ namespace AM { struct Applet; class IApplicationProxy; +class WindowSystem; class IApplicationProxyService final : public ServiceFramework { public: - explicit IApplicationProxyService(Core::System& system_); + explicit IApplicationProxyService(Core::System& system_, WindowSystem& window_system); ~IApplicationProxyService() override; private: @@ -24,6 +25,8 @@ private: private: std::shared_ptr GetAppletFromProcessId(ProcessId pid); + + WindowSystem& m_window_system; }; } // namespace AM diff --git a/src/core/hle/service/am/service/common_state_getter.cpp b/src/core/hle/service/am/service/common_state_getter.cpp index a32855ffa..f523bcd9e 100644 --- a/src/core/hle/service/am/service/common_state_getter.cpp +++ b/src/core/hle/service/am/service/common_state_getter.cpp @@ -80,15 +80,14 @@ ICommonStateGetter::~ICommonStateGetter() = default; Result ICommonStateGetter::GetEventHandle(OutCopyHandle out_event) { LOG_DEBUG(Service_AM, "called"); - *out_event = &m_applet->message_queue.GetMessageReceiveEvent(); + *out_event = m_applet->lifecycle_manager.GetSystemEvent().GetHandle(); R_SUCCEED(); } Result ICommonStateGetter::ReceiveMessage(Out out_applet_message) { LOG_DEBUG(Service_AM, "called"); - *out_applet_message = m_applet->message_queue.PopMessage(); - if (*out_applet_message == AppletMessage::None) { + if (!m_applet->lifecycle_manager.PopMessage(out_applet_message)) { LOG_ERROR(Service_AM, "Tried to pop message but none was available!"); R_THROW(AM::ResultNoMessages); } @@ -100,7 +99,7 @@ Result ICommonStateGetter::GetCurrentFocusState(Out out_focus_state) LOG_DEBUG(Service_AM, "called"); std::scoped_lock lk{m_applet->lock}; - *out_focus_state = m_applet->focus_state; + *out_focus_state = m_applet->lifecycle_manager.GetAndClearFocusState(); R_SUCCEED(); } @@ -137,7 +136,7 @@ Result ICommonStateGetter::GetWriterLockAccessorEx( Result ICommonStateGetter::GetDefaultDisplayResolutionChangeEvent( OutCopyHandle out_event) { LOG_DEBUG(Service_AM, "called"); - *out_event = &m_applet->message_queue.GetOperationModeChangedEvent(); + *out_event = m_applet->lifecycle_manager.GetOperationModeChangedSystemEvent().GetHandle(); R_SUCCEED(); } diff --git a/src/core/hle/service/am/service/home_menu_functions.cpp b/src/core/hle/service/am/service/home_menu_functions.cpp index 0c4d24b58..25f78beb5 100644 --- a/src/core/hle/service/am/service/home_menu_functions.cpp +++ b/src/core/hle/service/am/service/home_menu_functions.cpp @@ -4,13 +4,16 @@ #include "core/hle/result.h" #include "core/hle/service/am/applet_manager.h" #include "core/hle/service/am/service/home_menu_functions.h" +#include "core/hle/service/am/window_system.h" #include "core/hle/service/cmif_serialization.h" namespace Service::AM { -IHomeMenuFunctions::IHomeMenuFunctions(Core::System& system_, std::shared_ptr applet) - : ServiceFramework{system_, "IHomeMenuFunctions"}, m_applet{std::move(applet)}, - m_context{system, "IHomeMenuFunctions"}, m_pop_from_general_channel_event{m_context} { +IHomeMenuFunctions::IHomeMenuFunctions(Core::System& system_, std::shared_ptr applet, + WindowSystem& window_system) + : ServiceFramework{system_, "IHomeMenuFunctions"}, m_window_system{window_system}, + m_applet{std::move(applet)}, m_context{system, "IHomeMenuFunctions"}, + m_pop_from_general_channel_event{m_context} { // clang-format off static const FunctionInfo functions[] = { {10, D<&IHomeMenuFunctions::RequestToGetForeground>, "RequestToGetForeground"}, @@ -37,17 +40,20 @@ IHomeMenuFunctions::IHomeMenuFunctions(Core::System& system_, std::shared_ptr { public: - explicit IHomeMenuFunctions(Core::System& system_, std::shared_ptr applet); + explicit IHomeMenuFunctions(Core::System& system_, std::shared_ptr applet, + WindowSystem& window_system); ~IHomeMenuFunctions() override; private: @@ -26,6 +28,7 @@ private: Result IsForceTerminateApplicationDisabledForDebug( Out out_is_force_terminate_application_disabled_for_debug); + WindowSystem& m_window_system; const std::shared_ptr m_applet; KernelHelpers::ServiceContext m_context; Event m_pop_from_general_channel_event; diff --git a/src/core/hle/service/am/service/library_applet_accessor.cpp b/src/core/hle/service/am/service/library_applet_accessor.cpp index 0c2426d4b..cda8c3eb8 100644 --- a/src/core/hle/service/am/service/library_applet_accessor.cpp +++ b/src/core/hle/service/am/service/library_applet_accessor.cpp @@ -47,20 +47,21 @@ ILibraryAppletAccessor::~ILibraryAppletAccessor() = default; Result ILibraryAppletAccessor::GetAppletStateChangedEvent( OutCopyHandle out_event) { LOG_DEBUG(Service_AM, "called"); - *out_event = m_broker->GetStateChangedEvent().GetHandle(); + *out_event = m_applet->state_changed_event.GetHandle(); R_SUCCEED(); } Result ILibraryAppletAccessor::IsCompleted(Out out_is_completed) { LOG_DEBUG(Service_AM, "called"); - *out_is_completed = m_broker->IsCompleted(); + std::scoped_lock lk{m_applet->lock}; + *out_is_completed = m_applet->is_completed; R_SUCCEED(); } -Result ILibraryAppletAccessor::GetResult(Out out_result) { +Result ILibraryAppletAccessor::GetResult() { LOG_DEBUG(Service_AM, "called"); - *out_result = m_applet->terminate_result; - R_SUCCEED(); + std::scoped_lock lk{m_applet->lock}; + R_RETURN(m_applet->terminate_result); } Result ILibraryAppletAccessor::PresetLibraryAppletGpuTimeSliceZero() { @@ -77,7 +78,10 @@ Result ILibraryAppletAccessor::Start() { Result ILibraryAppletAccessor::RequestExit() { LOG_DEBUG(Service_AM, "called"); - m_applet->message_queue.RequestExit(); + { + std::scoped_lock lk{m_applet->lock}; + m_applet->lifecycle_manager.RequestExit(); + } FrontendRequestExit(); R_SUCCEED(); } diff --git a/src/core/hle/service/am/service/library_applet_accessor.h b/src/core/hle/service/am/service/library_applet_accessor.h index 97d3b6c8a..36712821a 100644 --- a/src/core/hle/service/am/service/library_applet_accessor.h +++ b/src/core/hle/service/am/service/library_applet_accessor.h @@ -21,7 +21,7 @@ public: private: Result GetAppletStateChangedEvent(OutCopyHandle out_event); Result IsCompleted(Out out_is_completed); - Result GetResult(Out out_result); + Result GetResult(); Result PresetLibraryAppletGpuTimeSliceZero(); Result Start(); Result RequestExit(); diff --git a/src/core/hle/service/am/service/library_applet_creator.cpp b/src/core/hle/service/am/service/library_applet_creator.cpp index c97358d81..3ffb03bc9 100644 --- a/src/core/hle/service/am/service/library_applet_creator.cpp +++ b/src/core/hle/service/am/service/library_applet_creator.cpp @@ -7,9 +7,11 @@ #include "core/hle/service/am/applet_manager.h" #include "core/hle/service/am/frontend/applets.h" #include "core/hle/service/am/library_applet_storage.h" +#include "core/hle/service/am/process_creation.h" #include "core/hle/service/am/service/library_applet_accessor.h" #include "core/hle/service/am/service/library_applet_creator.h" #include "core/hle/service/am/service/storage.h" +#include "core/hle/service/am/window_system.h" #include "core/hle/service/cmif_serialization.h" #include "core/hle/service/sm/sm.h" @@ -93,6 +95,7 @@ AppletProgramId AppletIdToProgramId(AppletId applet_id) { } std::shared_ptr CreateGuestApplet(Core::System& system, + WindowSystem& window_system, std::shared_ptr caller_applet, AppletId applet_id, LibraryAppletMode mode) { @@ -110,53 +113,38 @@ std::shared_ptr CreateGuestApplet(Core::System& system, Firmware1700 = 17, }; - auto process = std::make_unique(system); - if (!process->Initialize(program_id, Firmware1400, Firmware1700)) { + auto process = CreateProcess(system, program_id, Firmware1400, Firmware1700); + if (!process) { // Couldn't initialize the guest process return {}; } - const auto applet = std::make_shared(system, std::move(process)); + const auto applet = std::make_shared(system, std::move(process), false); applet->program_id = program_id; applet->applet_id = applet_id; applet->type = AppletType::LibraryApplet; applet->library_applet_mode = mode; - - // Set focus state - switch (mode) { - case LibraryAppletMode::AllForeground: - case LibraryAppletMode::NoUi: - case LibraryAppletMode::PartialForeground: - case LibraryAppletMode::PartialForegroundIndirectDisplay: - applet->hid_registration.EnableAppletToGetInput(true); - applet->focus_state = FocusState::InFocus; - applet->message_queue.PushMessage(AppletMessage::ChangeIntoForeground); - break; - case LibraryAppletMode::AllForegroundInitiallyHidden: - applet->hid_registration.EnableAppletToGetInput(false); - applet->focus_state = FocusState::NotInFocus; - applet->display_layer_manager.SetWindowVisibility(false); - applet->message_queue.PushMessage(AppletMessage::ChangeIntoBackground); - break; - } + applet->window_visible = mode != LibraryAppletMode::AllForegroundInitiallyHidden; auto broker = std::make_shared(system); applet->caller_applet = caller_applet; applet->caller_applet_broker = broker; + caller_applet->child_applets.push_back(applet); - system.GetAppletManager().InsertApplet(applet); + window_system.TrackApplet(applet, false); return std::make_shared(system, broker, applet); } std::shared_ptr CreateFrontendApplet(Core::System& system, + WindowSystem& window_system, std::shared_ptr caller_applet, AppletId applet_id, LibraryAppletMode mode) { const auto program_id = static_cast(AppletIdToProgramId(applet_id)); auto process = std::make_unique(system); - auto applet = std::make_shared(system, std::move(process)); + auto applet = std::make_shared(system, std::move(process), false); applet->program_id = program_id; applet->applet_id = applet_id; applet->type = AppletType::LibraryApplet; @@ -166,14 +154,19 @@ std::shared_ptr CreateFrontendApplet(Core::System& syste applet->caller_applet = caller_applet; applet->caller_applet_broker = storage; applet->frontend = system.GetFrontendAppletHolder().GetApplet(applet, applet_id, mode); + caller_applet->child_applets.push_back(applet); + + window_system.TrackApplet(applet, false); return std::make_shared(system, storage, applet); } } // namespace -ILibraryAppletCreator::ILibraryAppletCreator(Core::System& system_, std::shared_ptr applet) - : ServiceFramework{system_, "ILibraryAppletCreator"}, m_applet{std::move(applet)} { +ILibraryAppletCreator::ILibraryAppletCreator(Core::System& system_, std::shared_ptr applet, + WindowSystem& window_system) + : ServiceFramework{system_, "ILibraryAppletCreator"}, + m_window_system{window_system}, m_applet{std::move(applet)} { static const FunctionInfo functions[] = { {0, D<&ILibraryAppletCreator::CreateLibraryApplet>, "CreateLibraryApplet"}, {1, nullptr, "TerminateAllLibraryApplets"}, @@ -195,10 +188,12 @@ Result ILibraryAppletCreator::CreateLibraryApplet( std::shared_ptr library_applet; if (ShouldCreateGuestApplet(applet_id)) { - library_applet = CreateGuestApplet(system, m_applet, applet_id, library_applet_mode); + library_applet = + CreateGuestApplet(system, m_window_system, m_applet, applet_id, library_applet_mode); } if (!library_applet) { - library_applet = CreateFrontendApplet(system, m_applet, applet_id, library_applet_mode); + library_applet = + CreateFrontendApplet(system, m_window_system, m_applet, applet_id, library_applet_mode); } if (!library_applet) { LOG_ERROR(Service_AM, "Applet doesn't exist! applet_id={}", applet_id); diff --git a/src/core/hle/service/am/service/library_applet_creator.h b/src/core/hle/service/am/service/library_applet_creator.h index fe6d40eb3..a10a76982 100644 --- a/src/core/hle/service/am/service/library_applet_creator.h +++ b/src/core/hle/service/am/service/library_applet_creator.h @@ -12,10 +12,12 @@ namespace Service::AM { struct Applet; class ILibraryAppletAccessor; class IStorage; +class WindowSystem; class ILibraryAppletCreator final : public ServiceFramework { public: - explicit ILibraryAppletCreator(Core::System& system_, std::shared_ptr applet); + explicit ILibraryAppletCreator(Core::System& system_, std::shared_ptr applet, + WindowSystem& window_system); ~ILibraryAppletCreator() override; private: @@ -29,6 +31,7 @@ private: Result CreateHandleStorage(Out> out_storage, s64 size, InCopyHandle transfer_memory_handle); + WindowSystem& m_window_system; const std::shared_ptr m_applet; }; diff --git a/src/core/hle/service/am/service/library_applet_proxy.cpp b/src/core/hle/service/am/service/library_applet_proxy.cpp index 58e709347..f9cfb82a9 100644 --- a/src/core/hle/service/am/service/library_applet_proxy.cpp +++ b/src/core/hle/service/am/service/library_applet_proxy.cpp @@ -19,9 +19,9 @@ namespace Service::AM { ILibraryAppletProxy::ILibraryAppletProxy(Core::System& system_, std::shared_ptr applet, - Kernel::KProcess* process) - : ServiceFramework{system_, "ILibraryAppletProxy"}, m_process{process}, m_applet{ - std::move(applet)} { + Kernel::KProcess* process, WindowSystem& window_system) + : ServiceFramework{system_, "ILibraryAppletProxy"}, + m_window_system{window_system}, m_process{process}, m_applet{std::move(applet)} { // clang-format off static const FunctionInfo functions[] = { {0, D<&ILibraryAppletProxy::GetCommonStateGetter>, "GetCommonStateGetter"}, @@ -75,7 +75,7 @@ Result ILibraryAppletProxy::GetDebugFunctions( Result ILibraryAppletProxy::GetWindowController( Out> out_window_controller) { LOG_DEBUG(Service_AM, "called"); - *out_window_controller = std::make_shared(system, m_applet); + *out_window_controller = std::make_shared(system, m_applet, m_window_system); R_SUCCEED(); } @@ -96,7 +96,8 @@ Result ILibraryAppletProxy::GetCommonStateGetter( Result ILibraryAppletProxy::GetLibraryAppletCreator( Out> out_library_applet_creator) { LOG_DEBUG(Service_AM, "called"); - *out_library_applet_creator = std::make_shared(system, m_applet); + *out_library_applet_creator = + std::make_shared(system, m_applet, m_window_system); R_SUCCEED(); } @@ -118,7 +119,8 @@ Result ILibraryAppletProxy::GetAppletCommonFunctions( Result ILibraryAppletProxy::GetHomeMenuFunctions( Out> out_home_menu_functions) { LOG_DEBUG(Service_AM, "called"); - *out_home_menu_functions = std::make_shared(system, m_applet); + *out_home_menu_functions = + std::make_shared(system, m_applet, m_window_system); R_SUCCEED(); } diff --git a/src/core/hle/service/am/service/library_applet_proxy.h b/src/core/hle/service/am/service/library_applet_proxy.h index 7d0714b85..792d58582 100644 --- a/src/core/hle/service/am/service/library_applet_proxy.h +++ b/src/core/hle/service/am/service/library_applet_proxy.h @@ -21,11 +21,12 @@ class ILibraryAppletSelfAccessor; class IProcessWindingController; class ISelfController; class IWindowController; +class WindowSystem; class ILibraryAppletProxy final : public ServiceFramework { public: explicit ILibraryAppletProxy(Core::System& system_, std::shared_ptr applet, - Kernel::KProcess* process); + Kernel::KProcess* process, WindowSystem& window_system); ~ILibraryAppletProxy(); private: @@ -47,6 +48,7 @@ private: Result GetGlobalStateController( Out> out_global_state_controller); + WindowSystem& m_window_system; Kernel::KProcess* const m_process; const std::shared_ptr m_applet; }; diff --git a/src/core/hle/service/am/service/library_applet_self_accessor.cpp b/src/core/hle/service/am/service/library_applet_self_accessor.cpp index 330eb26f0..3fe36b899 100644 --- a/src/core/hle/service/am/service/library_applet_self_accessor.cpp +++ b/src/core/hle/service/am/service/library_applet_self_accessor.cpp @@ -176,8 +176,7 @@ Result ILibraryAppletSelfAccessor::GetMainAppletStorageId(Outaruid); - m_broker->SignalCompletion(); + m_applet->process->Terminate(); R_SUCCEED(); } diff --git a/src/core/hle/service/am/service/self_controller.cpp b/src/core/hle/service/am/service/self_controller.cpp index 06314407c..1db02b88f 100644 --- a/src/core/hle/service/am/service/self_controller.cpp +++ b/src/core/hle/service/am/service/self_controller.cpp @@ -86,8 +86,7 @@ ISelfController::~ISelfController() { Result ISelfController::Exit() { LOG_DEBUG(Service_AM, "called"); - // TODO - system.Exit(); + m_applet->process->Terminate(); R_SUCCEED(); } @@ -95,7 +94,16 @@ Result ISelfController::Exit() { Result ISelfController::LockExit() { LOG_DEBUG(Service_AM, "called"); - system.SetExitLocked(true); + std::scoped_lock lk{m_applet->lock}; + + if (m_applet->lifecycle_manager.GetExitRequested()) { + // With exit already requested, ignore and terminate immediately. + m_applet->process->Terminate(); + } else { + // Otherwise, set exit lock state. + m_applet->exit_locked = true; + system.SetExitLocked(true); + } R_SUCCEED(); } @@ -103,10 +111,13 @@ Result ISelfController::LockExit() { Result ISelfController::UnlockExit() { LOG_DEBUG(Service_AM, "called"); + std::scoped_lock lk{m_applet->lock}; + + m_applet->exit_locked = false; system.SetExitLocked(false); - if (system.GetExitRequested()) { - system.Exit(); + if (m_applet->lifecycle_manager.GetExitRequested()) { + m_applet->process->Terminate(); } R_SUCCEED(); @@ -155,7 +166,7 @@ Result ISelfController::SetOperationModeChangedNotification(bool enabled) { LOG_INFO(Service_AM, "called, enabled={}", enabled); std::scoped_lock lk{m_applet->lock}; - m_applet->operation_mode_changed_notification_enabled = enabled; + m_applet->lifecycle_manager.SetOperationModeChangedNotificationEnabled(enabled); R_SUCCEED(); } @@ -164,17 +175,18 @@ Result ISelfController::SetPerformanceModeChangedNotification(bool enabled) { LOG_INFO(Service_AM, "called, enabled={}", enabled); std::scoped_lock lk{m_applet->lock}; - m_applet->performance_mode_changed_notification_enabled = enabled; + m_applet->lifecycle_manager.SetPerformanceModeChangedNotificationEnabled(enabled); R_SUCCEED(); } Result ISelfController::SetFocusHandlingMode(bool notify, bool background, bool suspend) { - LOG_WARNING(Service_AM, "(STUBBED) called, notify={} background={} suspend={}", notify, - background, suspend); + LOG_INFO(Service_AM, "called, notify={} background={} suspend={}", notify, background, suspend); std::scoped_lock lk{m_applet->lock}; - m_applet->focus_handling_mode = {notify, background, suspend}; + m_applet->lifecycle_manager.SetFocusStateChangedNotificationEnabled(notify); + m_applet->lifecycle_manager.SetFocusHandlingMode(suspend); + m_applet->UpdateSuspensionStateLocked(true); R_SUCCEED(); } @@ -183,7 +195,7 @@ Result ISelfController::SetRestartMessageEnabled(bool enabled) { LOG_INFO(Service_AM, "called, enabled={}", enabled); std::scoped_lock lk{m_applet->lock}; - m_applet->restart_message_enabled = enabled; + m_applet->lifecycle_manager.SetResumeNotificationEnabled(enabled); R_SUCCEED(); } @@ -202,7 +214,8 @@ Result ISelfController::SetOutOfFocusSuspendingEnabled(bool enabled) { LOG_INFO(Service_AM, "called, enabled={}", enabled); std::scoped_lock lk{m_applet->lock}; - m_applet->out_of_focus_suspension_enabled = enabled; + m_applet->lifecycle_manager.SetOutOfFocusSuspendingEnabled(enabled); + m_applet->UpdateSuspensionStateLocked(false); R_SUCCEED(); } diff --git a/src/core/hle/service/am/service/system_applet_proxy.cpp b/src/core/hle/service/am/service/system_applet_proxy.cpp index d1871ef9b..c435288a2 100644 --- a/src/core/hle/service/am/service/system_applet_proxy.cpp +++ b/src/core/hle/service/am/service/system_applet_proxy.cpp @@ -19,9 +19,9 @@ namespace Service::AM { ISystemAppletProxy::ISystemAppletProxy(Core::System& system_, std::shared_ptr applet, - Kernel::KProcess* process) - : ServiceFramework{system_, "ISystemAppletProxy"}, m_process{process}, m_applet{ - std::move(applet)} { + Kernel::KProcess* process, WindowSystem& window_system) + : ServiceFramework{system_, "ISystemAppletProxy"}, + m_window_system{window_system}, m_process{process}, m_applet{std::move(applet)} { // clang-format off static const FunctionInfo functions[] = { {0, D<&ISystemAppletProxy::GetCommonStateGetter>, "GetCommonStateGetter"}, @@ -75,7 +75,7 @@ Result ISystemAppletProxy::GetDebugFunctions( Result ISystemAppletProxy::GetWindowController( Out> out_window_controller) { LOG_DEBUG(Service_AM, "called"); - *out_window_controller = std::make_shared(system, m_applet); + *out_window_controller = std::make_shared(system, m_applet, m_window_system); R_SUCCEED(); } @@ -96,14 +96,15 @@ Result ISystemAppletProxy::GetCommonStateGetter( Result ISystemAppletProxy::GetLibraryAppletCreator( Out> out_library_applet_creator) { LOG_DEBUG(Service_AM, "called"); - *out_library_applet_creator = std::make_shared(system, m_applet); + *out_library_applet_creator = + std::make_shared(system, m_applet, m_window_system); R_SUCCEED(); } Result ISystemAppletProxy::GetApplicationCreator( Out> out_application_creator) { LOG_DEBUG(Service_AM, "called"); - *out_application_creator = std::make_shared(system); + *out_application_creator = std::make_shared(system, m_window_system); R_SUCCEED(); } @@ -117,7 +118,8 @@ Result ISystemAppletProxy::GetAppletCommonFunctions( Result ISystemAppletProxy::GetHomeMenuFunctions( Out> out_home_menu_functions) { LOG_DEBUG(Service_AM, "called"); - *out_home_menu_functions = std::make_shared(system, m_applet); + *out_home_menu_functions = + std::make_shared(system, m_applet, m_window_system); R_SUCCEED(); } diff --git a/src/core/hle/service/am/service/system_applet_proxy.h b/src/core/hle/service/am/service/system_applet_proxy.h index 67cd50e03..217d9dc8c 100644 --- a/src/core/hle/service/am/service/system_applet_proxy.h +++ b/src/core/hle/service/am/service/system_applet_proxy.h @@ -21,11 +21,12 @@ class ILibraryAppletCreator; class IProcessWindingController; class ISelfController; class IWindowController; +class WindowSystem; class ISystemAppletProxy final : public ServiceFramework { public: explicit ISystemAppletProxy(Core::System& system, std::shared_ptr applet, - Kernel::KProcess* process); + Kernel::KProcess* process, WindowSystem& window_system); ~ISystemAppletProxy(); private: @@ -46,6 +47,7 @@ private: Result GetGlobalStateController( Out> out_global_state_controller); + WindowSystem& m_window_system; Kernel::KProcess* const m_process; const std::shared_ptr m_applet; }; diff --git a/src/core/hle/service/am/service/window_controller.cpp b/src/core/hle/service/am/service/window_controller.cpp index 99a4f50a2..54396affb 100644 --- a/src/core/hle/service/am/service/window_controller.cpp +++ b/src/core/hle/service/am/service/window_controller.cpp @@ -4,12 +4,15 @@ #include "core/hle/service/am/applet.h" #include "core/hle/service/am/applet_manager.h" #include "core/hle/service/am/service/window_controller.h" +#include "core/hle/service/am/window_system.h" #include "core/hle/service/cmif_serialization.h" namespace Service::AM { -IWindowController::IWindowController(Core::System& system_, std::shared_ptr applet) - : ServiceFramework{system_, "IWindowController"}, m_applet{std::move(applet)} { +IWindowController::IWindowController(Core::System& system_, std::shared_ptr applet, + WindowSystem& window_system) + : ServiceFramework{system_, "IWindowController"}, + m_window_system{window_system}, m_applet{std::move(applet)} { // clang-format off static const FunctionInfo functions[] = { {0, nullptr, "CreateWindow"}, @@ -63,17 +66,9 @@ Result IWindowController::RejectToChangeIntoBackground() { } Result IWindowController::SetAppletWindowVisibility(bool visible) { - m_applet->display_layer_manager.SetWindowVisibility(visible); - m_applet->hid_registration.EnableAppletToGetInput(visible); + LOG_INFO(Service_AM, "called"); - if (visible) { - m_applet->message_queue.PushMessage(AppletMessage::ChangeIntoForeground); - m_applet->focus_state = FocusState::InFocus; - } else { - m_applet->focus_state = FocusState::NotInFocus; - } - - m_applet->message_queue.PushMessage(AppletMessage::FocusStateChanged); + m_window_system.RequestAppletVisibilityState(*m_applet, visible); R_SUCCEED(); } diff --git a/src/core/hle/service/am/service/window_controller.h b/src/core/hle/service/am/service/window_controller.h index bfbad9bcc..a784dd4a4 100644 --- a/src/core/hle/service/am/service/window_controller.h +++ b/src/core/hle/service/am/service/window_controller.h @@ -9,10 +9,12 @@ namespace Service::AM { struct Applet; +class WindowSystem; class IWindowController final : public ServiceFramework { public: - explicit IWindowController(Core::System& system_, std::shared_ptr applet); + explicit IWindowController(Core::System& system_, std::shared_ptr applet, + WindowSystem& window_system); ~IWindowController() override; private: @@ -24,6 +26,7 @@ private: Result SetAppletWindowVisibility(bool visible); Result SetAppletGpuTimeSlice(s64 time_slice); + WindowSystem& m_window_system; const std::shared_ptr m_applet; }; diff --git a/src/core/hle/service/am/window_system.cpp b/src/core/hle/service/am/window_system.cpp new file mode 100644 index 000000000..5cf24007c --- /dev/null +++ b/src/core/hle/service/am/window_system.cpp @@ -0,0 +1,315 @@ +// SPDX-FileCopyrightText: Copyright 2024 yuzu Emulator Project +// SPDX-License-Identifier: GPL-2.0-or-later + +#include "core/core.h" +#include "core/hle/service/am/am_results.h" +#include "core/hle/service/am/applet.h" +#include "core/hle/service/am/applet_manager.h" +#include "core/hle/service/am/event_observer.h" +#include "core/hle/service/am/window_system.h" + +namespace Service::AM { + +WindowSystem::WindowSystem(Core::System& system) : m_system(system) {} + +WindowSystem::~WindowSystem() { + m_system.GetAppletManager().SetWindowSystem(nullptr); +} + +void WindowSystem::SetEventObserver(EventObserver* observer) { + m_event_observer = observer; + m_system.GetAppletManager().SetWindowSystem(this); +} + +void WindowSystem::Update() { + std::scoped_lock lk{m_lock}; + + // Loop through all applets and remove terminated applets. + this->PruneTerminatedAppletsLocked(); + + // If the home menu is being locked into the foreground, handle that. + if (this->LockHomeMenuIntoForegroundLocked()) { + return; + } + + // Recursively update each applet root. + this->UpdateAppletStateLocked(m_home_menu, m_foreground_requested_applet == m_home_menu); + this->UpdateAppletStateLocked(m_application, m_foreground_requested_applet == m_application); +} + +void WindowSystem::TrackApplet(std::shared_ptr applet, bool is_application) { + std::scoped_lock lk{m_lock}; + + if (applet->applet_id == AppletId::QLaunch) { + ASSERT(m_home_menu == nullptr); + m_home_menu = applet.get(); + } else if (is_application) { + ASSERT(m_application == nullptr); + m_application = applet.get(); + } + + m_event_observer->TrackAppletProcess(*applet); + m_applets.emplace(applet->aruid.pid, std::move(applet)); +} + +std::shared_ptr WindowSystem::GetByAppletResourceUserId(u64 aruid) { + std::scoped_lock lk{m_lock}; + + const auto it = m_applets.find(aruid); + if (it == m_applets.end()) { + return nullptr; + } + + return it->second; +} + +std::shared_ptr WindowSystem::GetMainApplet() { + std::scoped_lock lk{m_lock}; + + if (m_application) { + return m_applets.at(m_application->aruid.pid); + } + + return nullptr; +} + +void WindowSystem::RequestHomeMenuToGetForeground() { + { + std::scoped_lock lk{m_lock}; + m_foreground_requested_applet = m_home_menu; + } + + m_event_observer->RequestUpdate(); +} + +void WindowSystem::RequestApplicationToGetForeground() { + { + std::scoped_lock lk{m_lock}; + m_foreground_requested_applet = m_application; + } + + m_event_observer->RequestUpdate(); +} + +void WindowSystem::RequestLockHomeMenuIntoForeground() { + { + std::scoped_lock lk{m_lock}; + m_home_menu_foreground_locked = true; + } + + m_event_observer->RequestUpdate(); +} + +void WindowSystem::RequestUnlockHomeMenuIntoForeground() { + { + std::scoped_lock lk{m_lock}; + m_home_menu_foreground_locked = false; + } + + m_event_observer->RequestUpdate(); +} + +void WindowSystem::RequestAppletVisibilityState(Applet& applet, bool visible) { + { + std::scoped_lock lk{applet.lock}; + applet.window_visible = visible; + } + + m_event_observer->RequestUpdate(); +} + +void WindowSystem::OnOperationModeChanged() { + std::scoped_lock lk{m_lock}; + + for (const auto& [aruid, applet] : m_applets) { + std::scoped_lock lk2{applet->lock}; + applet->lifecycle_manager.OnOperationAndPerformanceModeChanged(); + } +} + +void WindowSystem::OnExitRequested() { + std::scoped_lock lk{m_lock}; + + for (const auto& [aruid, applet] : m_applets) { + std::scoped_lock lk2{applet->lock}; + applet->lifecycle_manager.RequestExit(); + } +} + +void WindowSystem::OnHomeButtonPressed(ButtonPressDuration type) { + std::scoped_lock lk{m_lock}; + + // If we don't have a home menu, nothing to do. + if (!m_home_menu) { + return; + } + + // Lock. + std::scoped_lock lk2{m_home_menu->lock}; + + // Send home button press event to home menu. + if (type == ButtonPressDuration::ShortPressing) { + m_home_menu->lifecycle_manager.PushUnorderedMessage( + AppletMessage::DetectShortPressingHomeButton); + } +} + +void WindowSystem::PruneTerminatedAppletsLocked() { + for (auto it = m_applets.begin(); it != m_applets.end(); /* ... */) { + const auto& [aruid, applet] = *it; + + std::scoped_lock lk{applet->lock}; + + if (!applet->process->IsTerminated()) { + // Not terminated. + it = std::next(it); + continue; + } + + // Terminated, so ensure all child applets are terminated. + if (!applet->child_applets.empty()) { + this->TerminateChildAppletsLocked(applet.get()); + + // Not ready to unlink until all child applets are terminated. + it = std::next(it); + continue; + } + + // Erase from caller applet's list of children. + if (auto caller_applet = applet->caller_applet.lock(); caller_applet) { + std::scoped_lock lk2{caller_applet->lock}; + std::erase(caller_applet->child_applets, applet); + applet->caller_applet.reset(); + + // We don't need to update the activity state of the caller applet yet. + // It will be recalculated once we fall out of the termination handling path. + } + + // If this applet was foreground, it no longer is. + if (applet.get() == m_foreground_requested_applet) { + m_foreground_requested_applet = nullptr; + } + + // If this was the home menu, we should clean up. + if (applet.get() == m_home_menu) { + m_home_menu = nullptr; + m_foreground_requested_applet = m_application; + } + + // If this was the application, we should try to switch to the home menu. + if (applet.get() == m_application) { + m_application = nullptr; + m_foreground_requested_applet = m_home_menu; + + // If we have a home menu, send it the application exited message. + if (m_home_menu) { + m_home_menu->lifecycle_manager.PushUnorderedMessage( + AppletMessage::ApplicationExited); + } + } + + // Finalize applet. + applet->OnProcessTerminatedLocked(); + + // Request update to ensure quiescence. + m_event_observer->RequestUpdate(); + + // Unlink and advance. + it = m_applets.erase(it); + } + + // If the last applet has exited, exit the system. + if (m_applets.empty()) { + m_system.Exit(); + } +} + +bool WindowSystem::LockHomeMenuIntoForegroundLocked() { + // If the home menu is not locked into foreground, then there's nothing to do. + if (m_home_menu == nullptr || !m_home_menu_foreground_locked) { + m_home_menu_foreground_locked = false; + return false; + } + + // Terminate any direct child applets of the home menu. + std::scoped_lock lk{m_home_menu->lock}; + + this->TerminateChildAppletsLocked(m_home_menu); + + // When there are zero child applets left, we can proceed with the update. + if (m_home_menu->child_applets.empty()) { + m_home_menu->window_visible = true; + m_foreground_requested_applet = m_home_menu; + return false; + } + + return true; +} + +void WindowSystem::TerminateChildAppletsLocked(Applet* applet) { + auto child_applets = applet->child_applets; + + applet->lock.unlock(); + for (const auto& child_applet : child_applets) { + std::scoped_lock lk{child_applet->lock}; + child_applet->process->Terminate(); + child_applet->terminate_result = AM::ResultLibraryAppletTerminated; + } + applet->lock.lock(); +} + +void WindowSystem::UpdateAppletStateLocked(Applet* applet, bool is_foreground) { + // With no applet, we don't have anything to do. + if (!applet) { + return; + } + + std::scoped_lock lk{applet->lock}; + + const bool inherited_foreground = applet->is_process_running && is_foreground; + const auto visible_state = + inherited_foreground ? ActivityState::ForegroundVisible : ActivityState::BackgroundVisible; + const auto obscured_state = inherited_foreground ? ActivityState::ForegroundObscured + : ActivityState::BackgroundObscured; + + const bool has_obscuring_child_applets = [&] { + for (const auto& child_applet : applet->child_applets) { + std::scoped_lock lk2{child_applet->lock}; + const auto mode = child_applet->library_applet_mode; + if (child_applet->is_process_running && child_applet->window_visible && + (mode == LibraryAppletMode::AllForeground || + mode == LibraryAppletMode::AllForegroundInitiallyHidden)) { + return true; + } + } + + return false; + }(); + + // Update visibility state. + applet->display_layer_manager.SetWindowVisibility(is_foreground && applet->window_visible); + + // Update interactibility state. + applet->SetInteractibleLocked(is_foreground && applet->window_visible); + + // Update focus state and suspension. + const bool is_obscured = has_obscuring_child_applets || !applet->window_visible; + const auto state = applet->lifecycle_manager.GetActivityState(); + + if (is_obscured && state != obscured_state) { + // Set obscured state. + applet->lifecycle_manager.SetActivityState(obscured_state); + applet->UpdateSuspensionStateLocked(true); + } else if (!is_obscured && state != visible_state) { + // Set visible state. + applet->lifecycle_manager.SetActivityState(visible_state); + applet->UpdateSuspensionStateLocked(true); + } + + // Recurse into child applets. + for (const auto& child_applet : applet->child_applets) { + this->UpdateAppletStateLocked(child_applet.get(), is_foreground); + } +} + +} // namespace Service::AM diff --git a/src/core/hle/service/am/window_system.h b/src/core/hle/service/am/window_system.h new file mode 100644 index 000000000..69e7a27ba --- /dev/null +++ b/src/core/hle/service/am/window_system.h @@ -0,0 +1,83 @@ +// SPDX-FileCopyrightText: Copyright 2024 yuzu Emulator Project +// SPDX-License-Identifier: GPL-2.0-or-later + +#pragma once + +#include +#include +#include + +#include "common/common_types.h" + +namespace Core { +class System; +} + +namespace Service::AM { + +struct Applet; +class EventObserver; + +enum class ButtonPressDuration { + ShortPressing, + MiddlePressing, + LongPressing, +}; + +class WindowSystem { +public: + explicit WindowSystem(Core::System& system); + ~WindowSystem(); + +public: + void SetEventObserver(EventObserver* event_observer); + void Update(); + +public: + void TrackApplet(std::shared_ptr applet, bool is_application); + std::shared_ptr GetByAppletResourceUserId(u64 aruid); + std::shared_ptr GetMainApplet(); + +public: + void RequestHomeMenuToGetForeground(); + void RequestApplicationToGetForeground(); + void RequestLockHomeMenuIntoForeground(); + void RequestUnlockHomeMenuIntoForeground(); + void RequestAppletVisibilityState(Applet& applet, bool visible); + +public: + void OnOperationModeChanged(); + void OnExitRequested(); + void OnHomeButtonPressed(ButtonPressDuration type); + void OnCaptureButtonPressed(ButtonPressDuration type) {} + void OnPowerButtonPressed(ButtonPressDuration type) {} + +private: + void PruneTerminatedAppletsLocked(); + bool LockHomeMenuIntoForegroundLocked(); + void TerminateChildAppletsLocked(Applet* applet); + void UpdateAppletStateLocked(Applet* applet, bool is_foreground); + +private: + // System reference. + Core::System& m_system; + + // Event observer. + EventObserver* m_event_observer{}; + + // Lock. + std::mutex m_lock{}; + + // Home menu state. + bool m_home_menu_foreground_locked{}; + Applet* m_foreground_requested_applet{}; + + // Foreground roots. + Applet* m_home_menu{}; + Applet* m_application{}; + + // Applet map by aruid. + std::map> m_applets{}; +}; + +} // namespace Service::AM diff --git a/src/core/hle/service/glue/time/manager.cpp b/src/core/hle/service/glue/time/manager.cpp index cb88486dd..77bf8896c 100644 --- a/src/core/hle/service/glue/time/manager.cpp +++ b/src/core/hle/service/glue/time/manager.cpp @@ -11,7 +11,6 @@ #include "core/file_sys/vfs/vfs.h" #include "core/hle/kernel/svc.h" #include "core/hle/service/glue/time/manager.h" -#include "core/hle/service/glue/time/time_zone_binary.h" #include "core/hle/service/psc/time/service_manager.h" #include "core/hle/service/psc/time/static.h" #include "core/hle/service/psc/time/system_clock.h" @@ -20,8 +19,8 @@ #include "core/hle/service/sm/sm.h" namespace Service::Glue::Time { -namespace { -s64 CalendarTimeToEpoch(Service::PSC::Time::CalendarTime calendar) { + +static s64 CalendarTimeToEpoch(Service::PSC::Time::CalendarTime calendar) { constexpr auto is_leap = [](s32 year) -> bool { return (((year) % 4) == 0 && (((year) % 100) != 0 || ((year) % 400) == 0)); }; @@ -50,7 +49,8 @@ s64 CalendarTimeToEpoch(Service::PSC::Time::CalendarTime calendar) { return epoch_s - 62135683200ll; } -s64 GetEpochTimeFromInitialYear(std::shared_ptr& set_sys) { +static s64 GetEpochTimeFromInitialYear( + std::shared_ptr& set_sys) { s32 year{2000}; set_sys->GetSettingsItemValueImpl(year, "time", "standard_user_clock_initial_year"); @@ -65,31 +65,32 @@ s64 GetEpochTimeFromInitialYear(std::shared_ptr("time:m", true); @@ -99,7 +100,7 @@ TimeManager::TimeManager(Core::System& system) m_set_sys = system.ServiceManager().GetService("set:sys", true); - res = MountTimeZoneBinary(system); + res = m_time_zone_binary.Mount(); ASSERT(res == ResultSuccess); m_worker.Initialize(m_time_sm, m_set_sys); @@ -187,10 +188,6 @@ TimeManager::TimeManager(Core::System& system) } } -TimeManager::~TimeManager() { - ResetTimeZoneBinary(); -} - Result TimeManager::SetupStandardSteadyClockCore() { Common::UUID external_clock_source_id{}; auto res = m_set_sys->GetExternalSteadyClockSourceId(&external_clock_source_id); @@ -236,7 +233,7 @@ Result TimeManager::SetupTimeZoneServiceCore() { auto res = m_set_sys->GetDeviceTimeZoneLocationName(&name); ASSERT(res == ResultSuccess); - auto configured_zone = GetTimeZoneString(name); + auto configured_zone = GetTimeZoneString(m_time_zone_binary, name); if (configured_zone != name) { m_set_sys->SetDeviceTimeZoneLocationName(configured_zone); @@ -254,13 +251,13 @@ Result TimeManager::SetupTimeZoneServiceCore() { res = m_set_sys->GetDeviceTimeZoneLocationUpdatedTime(&time_point); ASSERT(res == ResultSuccess); - auto location_count = GetTimeZoneCount(); + auto location_count = m_time_zone_binary.GetTimeZoneCount(); Service::PSC::Time::RuleVersion rule_version{}; - GetTimeZoneVersion(rule_version); + m_time_zone_binary.GetTimeZoneVersion(rule_version); std::span rule_buffer{}; size_t rule_size{}; - res = GetTimeZoneRule(rule_buffer, rule_size, name); + res = m_time_zone_binary.GetTimeZoneRule(rule_buffer, rule_size, name); ASSERT(res == ResultSuccess); res = m_time_m->SetupTimeZoneServiceCore(name, rule_version, location_count, time_point, diff --git a/src/core/hle/service/glue/time/manager.h b/src/core/hle/service/glue/time/manager.h index bb4b65049..87e6a21fd 100644 --- a/src/core/hle/service/glue/time/manager.h +++ b/src/core/hle/service/glue/time/manager.h @@ -10,6 +10,7 @@ #include "core/file_sys/vfs/vfs_types.h" #include "core/hle/service/glue/time/file_timestamp_worker.h" #include "core/hle/service/glue/time/standard_steady_clock_resource.h" +#include "core/hle/service/glue/time/time_zone_binary.h" #include "core/hle/service/glue/time/worker.h" #include "core/hle/service/service.h" @@ -26,7 +27,7 @@ namespace Service::Glue::Time { class TimeManager { public: explicit TimeManager(Core::System& system); - ~TimeManager(); + ~TimeManager() = default; std::shared_ptr m_set_sys; @@ -34,6 +35,7 @@ public: std::shared_ptr m_time_sm{}; StandardSteadyClockResource m_steady_clock_resource; FileTimestampWorker m_file_timestamp_worker; + TimeZoneBinary m_time_zone_binary; TimeWorker m_worker; private: diff --git a/src/core/hle/service/glue/time/static.cpp b/src/core/hle/service/glue/time/static.cpp index b801faef2..d8672b724 100644 --- a/src/core/hle/service/glue/time/static.cpp +++ b/src/core/hle/service/glue/time/static.cpp @@ -26,8 +26,9 @@ StaticService::StaticService(Core::System& system_, std::shared_ptr time, const char* name) : ServiceFramework{system_, name}, m_system{system_}, m_time_m{time->m_time_m}, m_setup_info{setup_info}, m_time_sm{time->m_time_sm}, - m_file_timestamp_worker{time->m_file_timestamp_worker}, m_standard_steady_clock_resource{ - time->m_steady_clock_resource} { + m_file_timestamp_worker{time->m_file_timestamp_worker}, + m_standard_steady_clock_resource{time->m_steady_clock_resource}, + m_time_zone_binary{time->m_time_zone_binary} { // clang-format off static const FunctionInfo functions[] = { {0, D<&StaticService::GetStandardUserSystemClock>, "GetStandardUserSystemClock"}, @@ -106,7 +107,7 @@ Result StaticService::GetTimeZoneService(OutInterface out_servi *out_service = std::make_shared( m_system, m_file_timestamp_worker, m_setup_info.can_write_timezone_device_location, - m_time_zone); + m_time_zone_binary, m_time_zone); R_SUCCEED(); } diff --git a/src/core/hle/service/glue/time/static.h b/src/core/hle/service/glue/time/static.h index 5d3623182..7bb0e0b36 100644 --- a/src/core/hle/service/glue/time/static.h +++ b/src/core/hle/service/glue/time/static.h @@ -80,5 +80,6 @@ private: std::shared_ptr m_time_zone; FileTimestampWorker& m_file_timestamp_worker; StandardSteadyClockResource& m_standard_steady_clock_resource; + TimeZoneBinary& m_time_zone_binary; }; } // namespace Service::Glue::Time diff --git a/src/core/hle/service/glue/time/time_zone.cpp b/src/core/hle/service/glue/time/time_zone.cpp index f4d0c87d5..b2e815965 100644 --- a/src/core/hle/service/glue/time/time_zone.cpp +++ b/src/core/hle/service/glue/time/time_zone.cpp @@ -15,19 +15,16 @@ #include "core/hle/service/sm/sm.h" namespace Service::Glue::Time { -namespace { -static std::mutex g_list_mutex; -static Common::IntrusiveListBaseTraits::ListType g_list_nodes{}; -} // namespace TimeZoneService::TimeZoneService( Core::System& system_, FileTimestampWorker& file_timestamp_worker, - bool can_write_timezone_device_location, + bool can_write_timezone_device_location, TimeZoneBinary& time_zone_binary, std::shared_ptr time_zone_service) : ServiceFramework{system_, "ITimeZoneService"}, m_system{system}, m_can_write_timezone_device_location{can_write_timezone_device_location}, - m_file_timestamp_worker{file_timestamp_worker}, - m_wrapped_service{std::move(time_zone_service)}, m_operation_event{m_system} { + m_file_timestamp_worker{file_timestamp_worker}, m_wrapped_service{std::move( + time_zone_service)}, + m_operation_event{m_system}, m_time_zone_binary{time_zone_binary} { // clang-format off static const FunctionInfo functions[] = { {0, D<&TimeZoneService::GetDeviceLocationName>, "GetDeviceLocationName"}, @@ -48,7 +45,6 @@ TimeZoneService::TimeZoneService( // clang-format on RegisterHandlers(functions); - g_list_nodes.clear(); m_set_sys = m_system.ServiceManager().GetService("set:sys", true); } @@ -69,13 +65,13 @@ Result TimeZoneService::SetDeviceLocationName( LOG_DEBUG(Service_Time, "called. location_name={}", location_name); R_UNLESS(m_can_write_timezone_device_location, Service::PSC::Time::ResultPermissionDenied); - R_UNLESS(IsTimeZoneBinaryValid(location_name), Service::PSC::Time::ResultTimeZoneNotFound); + R_UNLESS(m_time_zone_binary.IsValid(location_name), Service::PSC::Time::ResultTimeZoneNotFound); std::scoped_lock l{m_mutex}; std::span binary{}; size_t binary_size{}; - R_TRY(GetTimeZoneRule(binary, binary_size, location_name)) + R_TRY(m_time_zone_binary.GetTimeZoneRule(binary, binary_size, location_name)) R_TRY(m_wrapped_service->SetDeviceLocationNameWithTimeZoneRule(location_name, binary)); @@ -88,8 +84,8 @@ Result TimeZoneService::SetDeviceLocationName( m_set_sys->SetDeviceTimeZoneLocationName(name); m_set_sys->SetDeviceTimeZoneLocationUpdatedTime(time_point); - std::scoped_lock m{g_list_mutex}; - for (auto& operation_event : g_list_nodes) { + std::scoped_lock m{m_list_mutex}; + for (auto& operation_event : m_list_nodes) { operation_event.m_event->Signal(); } R_SUCCEED(); @@ -112,7 +108,8 @@ Result TimeZoneService::LoadLocationNameList( }; std::scoped_lock l{m_mutex}; - R_RETURN(GetTimeZoneLocationList(*out_count, out_names, out_names.size(), index)); + R_RETURN( + m_time_zone_binary.GetTimeZoneLocationList(*out_count, out_names, out_names.size(), index)); } Result TimeZoneService::LoadTimeZoneRule(OutRule out_rule, @@ -122,7 +119,7 @@ Result TimeZoneService::LoadTimeZoneRule(OutRule out_rule, std::scoped_lock l{m_mutex}; std::span binary{}; size_t binary_size{}; - R_TRY(GetTimeZoneRule(binary, binary_size, name)) + R_TRY(m_time_zone_binary.GetTimeZoneRule(binary, binary_size, name)) R_RETURN(m_wrapped_service->ParseTimeZoneBinary(out_rule, binary)); } @@ -174,7 +171,7 @@ Result TimeZoneService::GetDeviceLocationNameOperationEventReadableHandle( m_operation_event.m_ctx.CreateEvent("Psc:TimeZoneService:OperationEvent"); operation_event_initialized = true; std::scoped_lock l{m_mutex}; - g_list_nodes.push_back(m_operation_event); + m_list_nodes.push_back(m_operation_event); } *out_event = &m_operation_event.m_event->GetReadableEvent(); diff --git a/src/core/hle/service/glue/time/time_zone.h b/src/core/hle/service/glue/time/time_zone.h index beb54ddde..2b130b275 100644 --- a/src/core/hle/service/glue/time/time_zone.h +++ b/src/core/hle/service/glue/time/time_zone.h @@ -32,6 +32,7 @@ class TimeZoneService; namespace Service::Glue::Time { class FileTimestampWorker; +class TimeZoneBinary; class TimeZoneService final : public ServiceFramework { using InRule = InLargeData; @@ -40,7 +41,7 @@ class TimeZoneService final : public ServiceFramework { public: explicit TimeZoneService( Core::System& system, FileTimestampWorker& file_timestamp_worker, - bool can_write_timezone_device_location, + bool can_write_timezone_device_location, TimeZoneBinary& time_zone_binary, std::shared_ptr time_zone_service); ~TimeZoneService() override; @@ -85,6 +86,10 @@ private: std::mutex m_mutex; bool operation_event_initialized{}; Service::PSC::Time::OperationEvent m_operation_event; + TimeZoneBinary& m_time_zone_binary; + + std::mutex m_list_mutex; + Common::IntrusiveListBaseTraits::ListType m_list_nodes{}; }; } // namespace Service::Glue::Time diff --git a/src/core/hle/service/glue/time/time_zone_binary.cpp b/src/core/hle/service/glue/time/time_zone_binary.cpp index 18c6abd6b..3cc1a62af 100644 --- a/src/core/hle/service/glue/time/time_zone_binary.cpp +++ b/src/core/hle/service/glue/time/time_zone_binary.cpp @@ -12,18 +12,58 @@ #include "core/hle/service/glue/time/time_zone_binary.h" namespace Service::Glue::Time { -namespace { constexpr u64 TimeZoneBinaryId = 0x10000000000080E; -static FileSys::VirtualDir g_time_zone_binary_romfs{}; -static Result g_time_zone_binary_mount_result{ResultUnknown}; -static std::vector g_time_zone_scratch_space(0x2800, 0); +void TimeZoneBinary::Reset() { + time_zone_binary_romfs = {}; + time_zone_binary_mount_result = ResultUnknown; + time_zone_scratch_space.clear(); + time_zone_scratch_space.resize(0x2800, 0); +} -Result TimeZoneReadBinary(size_t& out_read_size, std::span out_buffer, size_t out_buffer_size, - std::string_view path) { - R_UNLESS(g_time_zone_binary_mount_result == ResultSuccess, g_time_zone_binary_mount_result); +Result TimeZoneBinary::Mount() { + Reset(); - auto vfs_file{g_time_zone_binary_romfs->GetFileRelative(path)}; + auto& fsc{system.GetFileSystemController()}; + std::unique_ptr nca{}; + + auto* bis_system = fsc.GetSystemNANDContents(); + + R_UNLESS(bis_system, ResultUnknown); + + nca = bis_system->GetEntry(TimeZoneBinaryId, FileSys::ContentRecordType::Data); + + if (nca) { + time_zone_binary_romfs = FileSys::ExtractRomFS(nca->GetRomFS()); + } + + if (time_zone_binary_romfs) { + // Validate that the romfs is readable, using invalid firmware keys can cause this to get + // set but the files to be garbage. In that case, we want to hit the next path and + // synthesise them instead. + time_zone_binary_mount_result = ResultSuccess; + Service::PSC::Time::LocationName name{"Etc/GMT"}; + if (!IsValid(name)) { + Reset(); + } + } + + if (!time_zone_binary_romfs) { + time_zone_binary_romfs = FileSys::ExtractRomFS( + FileSys::SystemArchive::SynthesizeSystemArchive(TimeZoneBinaryId)); + } + + R_UNLESS(time_zone_binary_romfs, ResultUnknown); + + time_zone_binary_mount_result = ResultSuccess; + R_SUCCEED(); +} + +Result TimeZoneBinary::Read(size_t& out_read_size, std::span out_buffer, size_t out_buffer_size, + std::string_view path) { + R_UNLESS(time_zone_binary_mount_result == ResultSuccess, time_zone_binary_mount_result); + + auto vfs_file{time_zone_binary_romfs->GetFileRelative(path)}; R_UNLESS(vfs_file, ResultUnknown); auto file_size{vfs_file->GetSize()}; @@ -36,82 +76,37 @@ Result TimeZoneReadBinary(size_t& out_read_size, std::span out_buffer, size_ R_SUCCEED(); } -} // namespace -void ResetTimeZoneBinary() { - g_time_zone_binary_romfs = {}; - g_time_zone_binary_mount_result = ResultUnknown; - g_time_zone_scratch_space.clear(); - g_time_zone_scratch_space.resize(0x2800, 0); -} - -Result MountTimeZoneBinary(Core::System& system) { - ResetTimeZoneBinary(); - - auto& fsc{system.GetFileSystemController()}; - std::unique_ptr nca{}; - - auto* bis_system = fsc.GetSystemNANDContents(); - - R_UNLESS(bis_system, ResultUnknown); - - nca = bis_system->GetEntry(TimeZoneBinaryId, FileSys::ContentRecordType::Data); - - if (nca) { - g_time_zone_binary_romfs = FileSys::ExtractRomFS(nca->GetRomFS()); - } - - if (g_time_zone_binary_romfs) { - // Validate that the romfs is readable, using invalid firmware keys can cause this to get - // set but the files to be garbage. In that case, we want to hit the next path and - // synthesise them instead. - g_time_zone_binary_mount_result = ResultSuccess; - Service::PSC::Time::LocationName name{"Etc/GMT"}; - if (!IsTimeZoneBinaryValid(name)) { - ResetTimeZoneBinary(); - } - } - - if (!g_time_zone_binary_romfs) { - g_time_zone_binary_romfs = FileSys::ExtractRomFS( - FileSys::SystemArchive::SynthesizeSystemArchive(TimeZoneBinaryId)); - } - - R_UNLESS(g_time_zone_binary_romfs, ResultUnknown); - - g_time_zone_binary_mount_result = ResultSuccess; - R_SUCCEED(); -} - -void GetTimeZoneBinaryListPath(std::string& out_path) { - if (g_time_zone_binary_mount_result != ResultSuccess) { +void TimeZoneBinary::GetListPath(std::string& out_path) { + if (time_zone_binary_mount_result != ResultSuccess) { return; } // out_path = fmt::format("{}:/binaryList.txt", "TimeZoneBinary"); out_path = "/binaryList.txt"; } -void GetTimeZoneBinaryVersionPath(std::string& out_path) { - if (g_time_zone_binary_mount_result != ResultSuccess) { +void TimeZoneBinary::GetVersionPath(std::string& out_path) { + if (time_zone_binary_mount_result != ResultSuccess) { return; } // out_path = fmt::format("{}:/version.txt", "TimeZoneBinary"); out_path = "/version.txt"; } -void GetTimeZoneZonePath(std::string& out_path, const Service::PSC::Time::LocationName& name) { - if (g_time_zone_binary_mount_result != ResultSuccess) { +void TimeZoneBinary::GetTimeZonePath(std::string& out_path, + const Service::PSC::Time::LocationName& name) { + if (time_zone_binary_mount_result != ResultSuccess) { return; } // out_path = fmt::format("{}:/zoneinfo/{}", "TimeZoneBinary", name); out_path = fmt::format("/zoneinfo/{}", name.data()); } -bool IsTimeZoneBinaryValid(const Service::PSC::Time::LocationName& name) { +bool TimeZoneBinary::IsValid(const Service::PSC::Time::LocationName& name) { std::string path{}; - GetTimeZoneZonePath(path, name); + GetTimeZonePath(path, name); - auto vfs_file{g_time_zone_binary_romfs->GetFileRelative(path)}; + auto vfs_file{time_zone_binary_romfs->GetFileRelative(path)}; if (!vfs_file) { LOG_INFO(Service_Time, "Could not find timezone file {}", path); return false; @@ -119,19 +114,19 @@ bool IsTimeZoneBinaryValid(const Service::PSC::Time::LocationName& name) { return vfs_file->GetSize() != 0; } -u32 GetTimeZoneCount() { +u32 TimeZoneBinary::GetTimeZoneCount() { std::string path{}; - GetTimeZoneBinaryListPath(path); + GetListPath(path); size_t bytes_read{}; - if (TimeZoneReadBinary(bytes_read, g_time_zone_scratch_space, 0x2800, path) != ResultSuccess) { + if (Read(bytes_read, time_zone_scratch_space, 0x2800, path) != ResultSuccess) { return 0; } if (bytes_read == 0) { return 0; } - auto chars = std::span(reinterpret_cast(g_time_zone_scratch_space.data()), bytes_read); + auto chars = std::span(reinterpret_cast(time_zone_scratch_space.data()), bytes_read); u32 count{}; for (auto chr : chars) { if (chr == '\n') { @@ -141,50 +136,47 @@ u32 GetTimeZoneCount() { return count; } -Result GetTimeZoneVersion(Service::PSC::Time::RuleVersion& out_rule_version) { +Result TimeZoneBinary::GetTimeZoneVersion(Service::PSC::Time::RuleVersion& out_rule_version) { std::string path{}; - GetTimeZoneBinaryVersionPath(path); + GetVersionPath(path); auto rule_version_buffer{std::span(reinterpret_cast(&out_rule_version), sizeof(Service::PSC::Time::RuleVersion))}; size_t bytes_read{}; - R_TRY(TimeZoneReadBinary(bytes_read, rule_version_buffer, rule_version_buffer.size_bytes(), - path)); + R_TRY(Read(bytes_read, rule_version_buffer, rule_version_buffer.size_bytes(), path)); rule_version_buffer[bytes_read] = 0; R_SUCCEED(); } -Result GetTimeZoneRule(std::span& out_rule, size_t& out_rule_size, - const Service::PSC::Time::LocationName& name) { +Result TimeZoneBinary::GetTimeZoneRule(std::span& out_rule, size_t& out_rule_size, + const Service::PSC::Time::LocationName& name) { std::string path{}; - GetTimeZoneZonePath(path, name); + GetTimeZonePath(path, name); size_t bytes_read{}; - R_TRY(TimeZoneReadBinary(bytes_read, g_time_zone_scratch_space, - g_time_zone_scratch_space.size(), path)); + R_TRY(Read(bytes_read, time_zone_scratch_space, time_zone_scratch_space.size(), path)); - out_rule = std::span(g_time_zone_scratch_space.data(), bytes_read); + out_rule = std::span(time_zone_scratch_space.data(), bytes_read); out_rule_size = bytes_read; R_SUCCEED(); } -Result GetTimeZoneLocationList(u32& out_count, - std::span out_names, - size_t max_names, u32 index) { +Result TimeZoneBinary::GetTimeZoneLocationList( + u32& out_count, std::span out_names, size_t max_names, + u32 index) { std::string path{}; - GetTimeZoneBinaryListPath(path); + GetListPath(path); size_t bytes_read{}; - R_TRY(TimeZoneReadBinary(bytes_read, g_time_zone_scratch_space, - g_time_zone_scratch_space.size(), path)); + R_TRY(Read(bytes_read, time_zone_scratch_space, time_zone_scratch_space.size(), path)); out_count = 0; R_SUCCEED_IF(bytes_read == 0); Service::PSC::Time::LocationName current_name{}; size_t current_name_len{}; - std::span chars{g_time_zone_scratch_space}; + std::span chars{time_zone_scratch_space}; u32 name_count{}; for (auto chr : chars) { diff --git a/src/core/hle/service/glue/time/time_zone_binary.h b/src/core/hle/service/glue/time/time_zone_binary.h index 9d0a8dfe9..135da631f 100644 --- a/src/core/hle/service/glue/time/time_zone_binary.h +++ b/src/core/hle/service/glue/time/time_zone_binary.h @@ -6,6 +6,7 @@ #include #include #include +#include #include "core/hle/service/psc/time/common.h" @@ -15,18 +16,34 @@ class System; namespace Service::Glue::Time { -void ResetTimeZoneBinary(); -Result MountTimeZoneBinary(Core::System& system); -void GetTimeZoneBinaryListPath(std::string& out_path); -void GetTimeZoneBinaryVersionPath(std::string& out_path); -void GetTimeZoneZonePath(std::string& out_path, const Service::PSC::Time::LocationName& name); -bool IsTimeZoneBinaryValid(const Service::PSC::Time::LocationName& name); -u32 GetTimeZoneCount(); -Result GetTimeZoneVersion(Service::PSC::Time::RuleVersion& out_rule_version); -Result GetTimeZoneRule(std::span& out_rule, size_t& out_rule_size, - const Service::PSC::Time::LocationName& name); -Result GetTimeZoneLocationList(u32& out_count, - std::span out_names, - size_t max_names, u32 index); +class TimeZoneBinary { +public: + explicit TimeZoneBinary(Core::System& system_) + : time_zone_scratch_space(0x2800, 0), system{system_} {} + + Result Mount(); + bool IsValid(const Service::PSC::Time::LocationName& name); + u32 GetTimeZoneCount(); + Result GetTimeZoneVersion(Service::PSC::Time::RuleVersion& out_rule_version); + Result GetTimeZoneRule(std::span& out_rule, size_t& out_rule_size, + const Service::PSC::Time::LocationName& name); + Result GetTimeZoneLocationList(u32& out_count, + std::span out_names, + size_t max_names, u32 index); + +private: + void Reset(); + Result Read(size_t& out_read_size, std::span out_buffer, size_t out_buffer_size, + std::string_view path); + void GetListPath(std::string& out_path); + void GetVersionPath(std::string& out_path); + void GetTimeZonePath(std::string& out_path, const Service::PSC::Time::LocationName& name); + + FileSys::VirtualDir time_zone_binary_romfs{}; + Result time_zone_binary_mount_result{ResultUnknown}; + std::vector time_zone_scratch_space; + + Core::System& system; +}; } // namespace Service::Glue::Time diff --git a/src/core/hle/service/glue/time/worker.cpp b/src/core/hle/service/glue/time/worker.cpp index b6bbd7965..1dab3e9dc 100644 --- a/src/core/hle/service/glue/time/worker.cpp +++ b/src/core/hle/service/glue/time/worker.cpp @@ -16,23 +16,6 @@ #include "core/hle/service/sm/sm.h" namespace Service::Glue::Time { -namespace { - -bool g_ig_report_network_clock_context_set{}; -Service::PSC::Time::SystemClockContext g_report_network_clock_context{}; -bool g_ig_report_ephemeral_clock_context_set{}; -Service::PSC::Time::SystemClockContext g_report_ephemeral_clock_context{}; - -template -T GetSettingsItemValue(std::shared_ptr& set_sys, - const char* category, const char* name) { - T v{}; - auto res = set_sys->GetSettingsItemValueImpl(v, category, name); - ASSERT(res == ResultSuccess); - return v; -} - -} // namespace TimeWorker::TimeWorker(Core::System& system, StandardSteadyClockResource& steady_clock_resource, FileTimestampWorker& file_timestamp_worker) @@ -43,11 +26,6 @@ TimeWorker::TimeWorker(Core::System& system, StandardSteadyClockResource& steady "Glue:TimeWorker:SteadyClockTimerEvent")}, m_timer_file_system{m_ctx.CreateEvent("Glue:TimeWorker:FileTimeTimerEvent")}, m_alarm_worker{m_system, m_steady_clock_resource}, m_pm_state_change_handler{m_alarm_worker} { - g_ig_report_network_clock_context_set = false; - g_report_network_clock_context = {}; - g_ig_report_ephemeral_clock_context_set = false; - g_report_ephemeral_clock_context = {}; - m_timer_steady_clock_timing_event = Core::Timing::CreateEvent( "Time::SteadyClockEvent", [this](s64 time, @@ -82,6 +60,14 @@ TimeWorker::~TimeWorker() { m_ctx.CloseEvent(m_timer_file_system); } +template +T TimeWorker::GetSettingsItemValue(const std::string& category, const std::string& name) { + T v{}; + auto res = m_set_sys->GetSettingsItemValueImpl(v, category, name); + ASSERT(res == ResultSuccess); + return v; +} + void TimeWorker::Initialize(std::shared_ptr time_sm, std::shared_ptr set_sys) { m_set_sys = std::move(set_sys); @@ -91,8 +77,8 @@ void TimeWorker::Initialize(std::shared_ptr t m_alarm_worker.Initialize(m_time_m); - auto steady_clock_interval_m = GetSettingsItemValue( - m_set_sys, "time", "standard_steady_clock_rtc_update_interval_minutes"); + auto steady_clock_interval_m = + GetSettingsItemValue("time", "standard_steady_clock_rtc_update_interval_minutes"); auto one_minute_ns{ std::chrono::duration_cast(std::chrono::minutes(1)).count()}; @@ -102,8 +88,7 @@ void TimeWorker::Initialize(std::shared_ptr t std::chrono::nanoseconds(steady_clock_interval_ns), m_timer_steady_clock_timing_event); - auto fs_notify_time_s = - GetSettingsItemValue(m_set_sys, "time", "notify_time_to_fs_interval_seconds"); + auto fs_notify_time_s = GetSettingsItemValue("time", "notify_time_to_fs_interval_seconds"); auto one_second_ns{ std::chrono::duration_cast(std::chrono::seconds(1)).count()}; s64 fs_notify_time_ns{fs_notify_time_s * one_second_ns}; @@ -218,14 +203,14 @@ void TimeWorker::ThreadFunc(std::stop_token stop_token) { } [[maybe_unused]] auto offset_before{ - g_ig_report_network_clock_context_set ? g_report_network_clock_context.offset : 0}; + m_ig_report_network_clock_context_set ? m_report_network_clock_context.offset : 0}; // TODO system report "standard_netclock_operation" // "clock_time" = time // "context_offset_before" = offset_before // "context_offset_after" = context.offset - g_report_network_clock_context = context; - if (!g_ig_report_network_clock_context_set) { - g_ig_report_network_clock_context_set = true; + m_report_network_clock_context = context; + if (!m_ig_report_network_clock_context_set) { + m_ig_report_network_clock_context_set = true; } m_file_timestamp_worker.SetFilesystemPosixTime(); @@ -247,16 +232,16 @@ void TimeWorker::ThreadFunc(std::stop_token stop_token) { break; } - [[maybe_unused]] auto offset_before{g_ig_report_ephemeral_clock_context_set - ? g_report_ephemeral_clock_context.offset + [[maybe_unused]] auto offset_before{m_ig_report_ephemeral_clock_context_set + ? m_report_ephemeral_clock_context.offset : 0}; // TODO system report "ephemeral_netclock_operation" // "clock_time" = time // "context_offset_before" = offset_before // "context_offset_after" = context.offset - g_report_ephemeral_clock_context = context; - if (!g_ig_report_ephemeral_clock_context_set) { - g_ig_report_ephemeral_clock_context_set = true; + m_report_ephemeral_clock_context = context; + if (!m_ig_report_ephemeral_clock_context_set) { + m_ig_report_ephemeral_clock_context_set = true; } break; } diff --git a/src/core/hle/service/glue/time/worker.h b/src/core/hle/service/glue/time/worker.h index 75e5c4d0f..69904e674 100644 --- a/src/core/hle/service/glue/time/worker.h +++ b/src/core/hle/service/glue/time/worker.h @@ -34,6 +34,9 @@ public: void StartThread(); private: + template + T GetSettingsItemValue(const std::string& category, const std::string& name); + void ThreadFunc(std::stop_token stop_token); Core::System& m_system; @@ -59,6 +62,11 @@ private: std::shared_ptr m_timer_file_system_timing_event; AlarmWorker m_alarm_worker; PmStateChangeHandler m_pm_state_change_handler; + + bool m_ig_report_network_clock_context_set{}; + Service::PSC::Time::SystemClockContext m_report_network_clock_context{}; + bool m_ig_report_ephemeral_clock_context_set{}; + Service::PSC::Time::SystemClockContext m_report_ephemeral_clock_context{}; }; } // namespace Service::Glue::Time diff --git a/src/core/hle/service/hid/hid.cpp b/src/core/hle/service/hid/hid.cpp index 1fa9cfbfb..47eedd73b 100644 --- a/src/core/hle/service/hid/hid.cpp +++ b/src/core/hle/service/hid/hid.cpp @@ -23,11 +23,7 @@ void LoopProcess(Core::System& system) { std::shared_ptr resource_manager = std::make_shared(system, firmware_settings); - // TODO: Remove this hack when am is emulated properly. resource_manager->Initialize(); - resource_manager->RegisterAppletResourceUserId(system.ApplicationProcess()->GetProcessId(), - true); - resource_manager->SetAruidValidForVibration(system.ApplicationProcess()->GetProcessId(), true); server_manager->RegisterNamedService( "hid", std::make_shared(system, resource_manager, firmware_settings)); diff --git a/src/core/hle/service/nvdrv/core/container.h b/src/core/hle/service/nvdrv/core/container.h index f159ced09..cf549d7f3 100644 --- a/src/core/hle/service/nvdrv/core/container.h +++ b/src/core/hle/service/nvdrv/core/container.h @@ -68,10 +68,7 @@ public: const SyncpointManager& GetSyncpointManager() const; struct Host1xDeviceFileData { - std::unordered_map fd_to_id{}; std::deque syncpts_accumulated{}; - u32 nvdec_next_id{}; - u32 vic_next_id{}; }; Host1xDeviceFileData& Host1xDeviceFile(); diff --git a/src/core/hle/service/nvdrv/devices/nvhost_nvdec.cpp b/src/core/hle/service/nvdrv/devices/nvhost_nvdec.cpp index 2c0ac2a46..60b89b628 100644 --- a/src/core/hle/service/nvdrv/devices/nvhost_nvdec.cpp +++ b/src/core/hle/service/nvdrv/devices/nvhost_nvdec.cpp @@ -8,6 +8,7 @@ #include "core/hle/service/nvdrv/core/container.h" #include "core/hle/service/nvdrv/devices/ioctl_serialization.h" #include "core/hle/service/nvdrv/devices/nvhost_nvdec.h" +#include "video_core/host1x/host1x.h" #include "video_core/renderer_base.h" namespace Service::Nvidia::Devices { @@ -21,13 +22,8 @@ NvResult nvhost_nvdec::Ioctl1(DeviceFD fd, Ioctl command, std::span in switch (command.group) { case 0x0: switch (command.cmd) { - case 0x1: { - auto& host1x_file = core.Host1xDeviceFile(); - if (!host1x_file.fd_to_id.contains(fd)) { - host1x_file.fd_to_id[fd] = host1x_file.nvdec_next_id++; - } + case 0x1: return WrapFixedVariable(this, &nvhost_nvdec::Submit, input, output, fd); - } case 0x2: return WrapFixed(this, &nvhost_nvdec::GetSyncpoint, input, output); case 0x3: @@ -72,15 +68,12 @@ void nvhost_nvdec::OnOpen(NvCore::SessionId session_id, DeviceFD fd) { LOG_INFO(Service_NVDRV, "NVDEC video stream started"); system.SetNVDECActive(true); sessions[fd] = session_id; + host1x.StartDevice(fd, Tegra::Host1x::ChannelType::NvDec, channel_syncpoint); } void nvhost_nvdec::OnClose(DeviceFD fd) { LOG_INFO(Service_NVDRV, "NVDEC video stream ended"); - auto& host1x_file = core.Host1xDeviceFile(); - const auto iter = host1x_file.fd_to_id.find(fd); - if (iter != host1x_file.fd_to_id.end()) { - system.GPU().ClearCdmaInstance(iter->second); - } + host1x.StopDevice(fd, Tegra::Host1x::ChannelType::NvDec); system.SetNVDECActive(false); auto it = sessions.find(fd); if (it != sessions.end()) { diff --git a/src/core/hle/service/nvdrv/devices/nvhost_nvdec_common.cpp b/src/core/hle/service/nvdrv/devices/nvhost_nvdec_common.cpp index a0a7bfa40..9ca6308e6 100644 --- a/src/core/hle/service/nvdrv/devices/nvhost_nvdec_common.cpp +++ b/src/core/hle/service/nvdrv/devices/nvhost_nvdec_common.cpp @@ -55,8 +55,9 @@ std::size_t WriteVectors(std::span dst, const std::vector& src, std::size nvhost_nvdec_common::nvhost_nvdec_common(Core::System& system_, NvCore::Container& core_, NvCore::ChannelType channel_type_) - : nvdevice{system_}, core{core_}, syncpoint_manager{core.GetSyncpointManager()}, - nvmap{core.GetNvMapFile()}, channel_type{channel_type_} { + : nvdevice{system_}, host1x{system_.Host1x()}, core{core_}, + syncpoint_manager{core.GetSyncpointManager()}, nvmap{core.GetNvMapFile()}, + channel_type{channel_type_} { auto& syncpts_accumulated = core.Host1xDeviceFile().syncpts_accumulated; if (syncpts_accumulated.empty()) { channel_syncpoint = syncpoint_manager.AllocateSyncpoint(false); @@ -95,24 +96,24 @@ NvResult nvhost_nvdec_common::Submit(IoctlSubmit& params, std::span data, De offset += SliceVectors(data, syncpt_increments, params.syncpoint_count, offset); offset += SliceVectors(data, fence_thresholds, params.fence_count, offset); - auto& gpu = system.GPU(); auto* session = core.GetSession(sessions[fd]); - if (gpu.UseNvdec()) { - for (std::size_t i = 0; i < syncpt_increments.size(); i++) { - const SyncptIncr& syncpt_incr = syncpt_increments[i]; - fence_thresholds[i] = - syncpoint_manager.IncrementSyncpointMaxExt(syncpt_incr.id, syncpt_incr.increments); - } + for (std::size_t i = 0; i < syncpt_increments.size(); i++) { + const SyncptIncr& syncpt_incr = syncpt_increments[i]; + fence_thresholds[i] = + syncpoint_manager.IncrementSyncpointMaxExt(syncpt_incr.id, syncpt_incr.increments); } + for (const auto& cmd_buffer : command_buffers) { const auto object = nvmap.GetHandle(cmd_buffer.memory_id); ASSERT_OR_EXECUTE(object, return NvResult::InvalidState;); - Tegra::ChCommandHeaderList cmdlist(cmd_buffer.word_count); - session->process->GetMemory().ReadBlock(object->address + cmd_buffer.offset, cmdlist.data(), - cmdlist.size() * sizeof(u32)); - gpu.PushCommandBuffer(core.Host1xDeviceFile().fd_to_id[fd], cmdlist); + Core::Memory::CpuGuestMemory + cmdlist(session->process->GetMemory(), object->address + cmd_buffer.offset, + cmd_buffer.word_count); + host1x.PushEntries(fd, std::move(cmdlist)); } + // Some games expect command_buffers to be written back offset = 0; offset += WriteVectors(data, command_buffers, offset); diff --git a/src/core/hle/service/nvdrv/devices/nvhost_nvdec_common.h b/src/core/hle/service/nvdrv/devices/nvhost_nvdec_common.h index 900db81d2..63e637760 100644 --- a/src/core/hle/service/nvdrv/devices/nvhost_nvdec_common.h +++ b/src/core/hle/service/nvdrv/devices/nvhost_nvdec_common.h @@ -119,6 +119,7 @@ protected: Kernel::KEvent* QueryEvent(u32 event_id) override; + Tegra::Host1x::Host1x& host1x; u32 channel_syncpoint; s32_le nvmap_fd{}; u32_le submit_timeout{}; diff --git a/src/core/hle/service/nvdrv/devices/nvhost_vic.cpp b/src/core/hle/service/nvdrv/devices/nvhost_vic.cpp index bf090f5eb..8219a2c7e 100644 --- a/src/core/hle/service/nvdrv/devices/nvhost_vic.cpp +++ b/src/core/hle/service/nvdrv/devices/nvhost_vic.cpp @@ -7,6 +7,7 @@ #include "core/hle/service/nvdrv/core/container.h" #include "core/hle/service/nvdrv/devices/ioctl_serialization.h" #include "core/hle/service/nvdrv/devices/nvhost_vic.h" +#include "video_core/host1x/host1x.h" #include "video_core/renderer_base.h" namespace Service::Nvidia::Devices { @@ -21,13 +22,8 @@ NvResult nvhost_vic::Ioctl1(DeviceFD fd, Ioctl command, std::span inpu switch (command.group) { case 0x0: switch (command.cmd) { - case 0x1: { - auto& host1x_file = core.Host1xDeviceFile(); - if (!host1x_file.fd_to_id.contains(fd)) { - host1x_file.fd_to_id[fd] = host1x_file.vic_next_id++; - } + case 0x1: return WrapFixedVariable(this, &nvhost_vic::Submit, input, output, fd); - } case 0x2: return WrapFixed(this, &nvhost_vic::GetSyncpoint, input, output); case 0x3: @@ -70,14 +66,11 @@ NvResult nvhost_vic::Ioctl3(DeviceFD fd, Ioctl command, std::span inpu void nvhost_vic::OnOpen(NvCore::SessionId session_id, DeviceFD fd) { sessions[fd] = session_id; + host1x.StartDevice(fd, Tegra::Host1x::ChannelType::VIC, channel_syncpoint); } void nvhost_vic::OnClose(DeviceFD fd) { - auto& host1x_file = core.Host1xDeviceFile(); - const auto iter = host1x_file.fd_to_id.find(fd); - if (iter != host1x_file.fd_to_id.end()) { - system.GPU().ClearCdmaInstance(iter->second); - } + host1x.StopDevice(fd, Tegra::Host1x::ChannelType::VIC); sessions.erase(fd); } diff --git a/src/core/hle/service/am/process.cpp b/src/core/hle/service/os/process.cpp similarity index 58% rename from src/core/hle/service/am/process.cpp rename to src/core/hle/service/os/process.cpp index 388d2045c..0dbadc315 100644 --- a/src/core/hle/service/am/process.cpp +++ b/src/core/hle/service/os/process.cpp @@ -3,66 +3,25 @@ #include "common/scope_exit.h" -#include "core/file_sys/content_archive.h" -#include "core/file_sys/nca_metadata.h" -#include "core/file_sys/registered_cache.h" #include "core/hle/kernel/k_process.h" -#include "core/hle/service/am/process.h" -#include "core/hle/service/filesystem/filesystem.h" +#include "core/hle/kernel/svc_types.h" +#include "core/hle/service/os/process.h" #include "core/loader/loader.h" -namespace Service::AM { +namespace Service { Process::Process(Core::System& system) : m_system(system), m_process(), m_main_thread_priority(), m_main_thread_stack_size(), - m_program_id(), m_process_started() {} + m_process_started() {} Process::~Process() { this->Finalize(); } -bool Process::Initialize(u64 program_id, u8 minimum_key_generation, u8 maximum_key_generation) { +bool Process::Initialize(Loader::AppLoader& loader, Loader::ResultStatus& out_load_result) { // First, ensure we are not holding another process. this->Finalize(); - // Get the filesystem controller. - auto& fsc = m_system.GetFileSystemController(); - - // Attempt to load program NCA. - const FileSys::RegisteredCache* bis_system{}; - FileSys::VirtualFile nca_raw{}; - - // Get the program NCA from built-in storage. - bis_system = fsc.GetSystemNANDContents(); - if (bis_system) { - nca_raw = bis_system->GetEntryRaw(program_id, FileSys::ContentRecordType::Program); - } - - // Ensure we retrieved a program NCA. - if (!nca_raw) { - return false; - } - - // Ensure we have a suitable version. - if (minimum_key_generation > 0) { - FileSys::NCA nca(nca_raw); - if (nca.GetStatus() == Loader::ResultStatus::Success && - (nca.GetKeyGeneration() < minimum_key_generation || - nca.GetKeyGeneration() > maximum_key_generation)) { - LOG_WARNING(Service_LDR, "Skipping program {:016X} with generation {}", program_id, - nca.GetKeyGeneration()); - return false; - } - } - - // Get the appropriate loader to parse this NCA. - auto app_loader = Loader::GetLoader(m_system, nca_raw, program_id, 0); - - // Ensure we have a loader which can parse the NCA. - if (!app_loader) { - return false; - } - // Create the process. auto* const process = Kernel::KProcess::Create(m_system.Kernel()); Kernel::KProcess::Register(m_system.Kernel(), process); @@ -73,7 +32,8 @@ bool Process::Initialize(u64 program_id, u8 minimum_key_generation, u8 maximum_k }; // Insert process modules into memory. - const auto [load_result, load_parameters] = app_loader->Load(*process, m_system); + const auto [load_result, load_parameters] = loader.Load(*process, m_system); + out_load_result = load_result; // Ensure loading was successful. if (load_result != Loader::ResultStatus::Success) { @@ -114,7 +74,6 @@ void Process::Finalize() { m_process = nullptr; m_main_thread_priority = 0; m_main_thread_stack_size = 0; - m_program_id = 0; m_process_started = false; } @@ -142,6 +101,31 @@ void Process::Terminate() { } } +void Process::ResetSignal() { + if (m_process) { + m_process->Reset(); + } +} + +bool Process::IsRunning() const { + if (m_process) { + const auto state = m_process->GetState(); + return state == Kernel::KProcess::State::Running || + state == Kernel::KProcess::State::RunningAttached || + state == Kernel::KProcess::State::DebugBreak; + } + + return false; +} + +bool Process::IsTerminated() const { + if (m_process) { + return m_process->IsTerminated(); + } + + return false; +} + u64 Process::GetProcessId() const { if (m_process) { return m_process->GetProcessId(); @@ -150,4 +134,19 @@ u64 Process::GetProcessId() const { return 0; } -} // namespace Service::AM +u64 Process::GetProgramId() const { + if (m_process) { + return m_process->GetProgramId(); + } + + return 0; +} + +void Process::Suspend(bool suspended) { + if (m_process) { + m_process->SetActivity(suspended ? Kernel::Svc::ProcessActivity::Paused + : Kernel::Svc::ProcessActivity::Runnable); + } +} + +} // namespace Service diff --git a/src/core/hle/service/am/process.h b/src/core/hle/service/os/process.h similarity index 62% rename from src/core/hle/service/am/process.h rename to src/core/hle/service/os/process.h index 4b8102fb6..9109b7d0a 100644 --- a/src/core/hle/service/am/process.h +++ b/src/core/hle/service/os/process.h @@ -3,38 +3,47 @@ #pragma once -#include "common/common_funcs.h" #include "common/common_types.h" -namespace Kernel { -class KProcess; -} - namespace Core { class System; } -namespace Service::AM { +namespace Loader { +class AppLoader; +enum class ResultStatus : u16; +} // namespace Loader + +namespace Kernel { +class KProcess; +} + +namespace Service { class Process { public: explicit Process(Core::System& system); ~Process(); - bool Initialize(u64 program_id, u8 minimum_key_generation, u8 maximum_key_generation); + bool Initialize(Loader::AppLoader& loader, Loader::ResultStatus& out_load_result); void Finalize(); bool Run(); void Terminate(); + void Suspend(bool suspended); + void ResetSignal(); bool IsInitialized() const { return m_process != nullptr; } + + bool IsRunning() const; + bool IsTerminated() const; + u64 GetProcessId() const; - u64 GetProgramId() const { - return m_program_id; - } - Kernel::KProcess* GetProcess() const { + u64 GetProgramId() const; + + Kernel::KProcess* GetHandle() const { return m_process; } @@ -43,8 +52,7 @@ private: Kernel::KProcess* m_process{}; s32 m_main_thread_priority{}; u64 m_main_thread_stack_size{}; - u64 m_program_id{}; bool m_process_started{}; }; -} // namespace Service::AM +} // namespace Service diff --git a/src/core/loader/loader.cpp b/src/core/loader/loader.cpp index b6e355622..6aabdc75e 100644 --- a/src/core/loader/loader.cpp +++ b/src/core/loader/loader.cpp @@ -36,22 +36,23 @@ std::optional IdentifyFileLoader(FileSys::VirtualFile file) { } // namespace FileType IdentifyFile(FileSys::VirtualFile file) { - if (const auto romdir_type = IdentifyFileLoader(file)) { - return *romdir_type; - } else if (const auto nso_type = IdentifyFileLoader(file)) { - return *nso_type; + if (const auto nsp_type = IdentifyFileLoader(file)) { + return *nsp_type; + } else if (const auto xci_type = IdentifyFileLoader(file)) { + return *xci_type; } else if (const auto nro_type = IdentifyFileLoader(file)) { return *nro_type; } else if (const auto nca_type = IdentifyFileLoader(file)) { return *nca_type; - } else if (const auto xci_type = IdentifyFileLoader(file)) { - return *xci_type; } else if (const auto nax_type = IdentifyFileLoader(file)) { return *nax_type; - } else if (const auto nsp_type = IdentifyFileLoader(file)) { - return *nsp_type; } else if (const auto kip_type = IdentifyFileLoader(file)) { return *kip_type; + } else if (const auto nso_type = IdentifyFileLoader(file)) { + return *nso_type; + } else if (const auto romdir_type = + IdentifyFileLoader(file)) { + return *romdir_type; } else { return FileType::Unknown; } diff --git a/src/core/memory.cpp b/src/core/memory.cpp index 8775369a4..6f7703fce 100644 --- a/src/core/memory.cpp +++ b/src/core/memory.cpp @@ -49,8 +49,7 @@ struct Memory::Impl { void SetCurrentPageTable(Kernel::KProcess& process) { current_page_table = &process.GetPageTable().GetImpl(); - if (std::addressof(process) == system.ApplicationProcess() && - Settings::IsFastmemEnabled()) { + if (process.IsApplication() && Settings::IsFastmemEnabled()) { current_page_table->fastmem_arena = system.DeviceMemory().buffer.VirtualBasePointer(); } else { current_page_table->fastmem_arena = nullptr; diff --git a/src/core/memory.h b/src/core/memory.h index f7e6b297f..dcca26892 100644 --- a/src/core/memory.h +++ b/src/core/memory.h @@ -64,6 +64,8 @@ public: Memory(Memory&&) = default; Memory& operator=(Memory&&) = delete; + static constexpr bool HAS_FLUSH_INVALIDATION = false; + /** * Resets the state of the Memory system. */ diff --git a/src/video_core/CMakeLists.txt b/src/video_core/CMakeLists.txt index 2de2beb6e..ed8bf6c00 100644 --- a/src/video_core/CMakeLists.txt +++ b/src/video_core/CMakeLists.txt @@ -60,8 +60,8 @@ add_library(video_core STATIC framebuffer_config.h fsr.cpp fsr.h - host1x/codecs/codec.cpp - host1x/codecs/codec.h + host1x/codecs/decoder.cpp + host1x/codecs/decoder.h host1x/codecs/h264.cpp host1x/codecs/h264.h host1x/codecs/vp8.cpp @@ -80,8 +80,6 @@ add_library(video_core STATIC host1x/nvdec.cpp host1x/nvdec.h host1x/nvdec_common.h - host1x/sync_manager.cpp - host1x/sync_manager.h host1x/syncpoint_manager.cpp host1x/syncpoint_manager.h host1x/vic.cpp @@ -374,6 +372,10 @@ if (ARCHITECTURE_x86_64) macro/macro_jit_x64.h ) target_link_libraries(video_core PUBLIC xbyak::xbyak) + + if (NOT MSVC) + target_compile_options(video_core PRIVATE -msse4.1) + endif() endif() if (ARCHITECTURE_x86_64 OR ARCHITECTURE_arm64) @@ -392,4 +394,8 @@ if (ANDROID AND ARCHITECTURE_arm64) target_link_libraries(video_core PRIVATE adrenotools) endif() +if (ARCHITECTURE_arm64) + target_link_libraries(video_core PRIVATE sse2neon) +endif() + create_target_directory_groups(video_core) diff --git a/src/video_core/cdma_pusher.cpp b/src/video_core/cdma_pusher.cpp index 28a2d2090..3bcf1b066 100644 --- a/src/video_core/cdma_pusher.cpp +++ b/src/video_core/cdma_pusher.cpp @@ -2,136 +2,130 @@ // SPDX-License-Identifier: MIT #include + +#include "common/thread.h" +#include "core/core.h" #include "video_core/cdma_pusher.h" #include "video_core/engines/maxwell_3d.h" #include "video_core/host1x/control.h" #include "video_core/host1x/host1x.h" #include "video_core/host1x/nvdec.h" #include "video_core/host1x/nvdec_common.h" -#include "video_core/host1x/sync_manager.h" #include "video_core/host1x/vic.h" #include "video_core/memory_manager.h" namespace Tegra { -CDmaPusher::CDmaPusher(Host1x::Host1x& host1x_) - : host1x{host1x_}, nvdec_processor(std::make_shared(host1x)), - vic_processor(std::make_unique(host1x, nvdec_processor)), - host1x_processor(std::make_unique(host1x)), - sync_manager(std::make_unique(host1x)) {} + +CDmaPusher::CDmaPusher(Host1x::Host1x& host1x_, s32 id) + : host1x{host1x_}, memory_manager{host1x.GMMU()}, + host_processor{std::make_unique(host1x_)}, current_class{ + static_cast(id)} { + thread = std::jthread([this](std::stop_token stop_token) { ProcessEntries(stop_token); }); +} CDmaPusher::~CDmaPusher() = default; -void CDmaPusher::ProcessEntries(ChCommandHeaderList&& entries) { - for (const auto& value : entries) { - if (mask != 0) { - const auto lbs = static_cast(std::countr_zero(mask)); - mask &= ~(1U << lbs); - ExecuteCommand(offset + lbs, value.raw); - continue; - } else if (count != 0) { - --count; - ExecuteCommand(offset, value.raw); - if (incrementing) { - ++offset; +void CDmaPusher::ProcessEntries(std::stop_token stop_token) { + Common::SetCurrentThreadPriority(Common::ThreadPriority::High); + ChCommandHeaderList command_list{host1x.System().ApplicationMemory(), 0, 0}; + u32 count{}; + u32 method_offset{}; + u32 mask{}; + bool incrementing{}; + + while (!stop_token.stop_requested()) { + { + std::unique_lock l{command_mutex}; + Common::CondvarWait(command_cv, l, stop_token, + [this]() { return command_lists.size() > 0; }); + if (stop_token.stop_requested()) { + return; } - continue; + + command_list = std::move(command_lists.front()); + command_lists.pop_front(); } - const auto mode = value.submission_mode.Value(); - switch (mode) { - case ChSubmissionMode::SetClass: { - mask = value.value & 0x3f; - offset = value.method_offset; - current_class = static_cast((value.value >> 6) & 0x3ff); - break; - } - case ChSubmissionMode::Incrementing: - case ChSubmissionMode::NonIncrementing: - count = value.value; - offset = value.method_offset; - incrementing = mode == ChSubmissionMode::Incrementing; - break; - case ChSubmissionMode::Mask: - mask = value.value; - offset = value.method_offset; - break; - case ChSubmissionMode::Immediate: { - const u32 data = value.value & 0xfff; - offset = value.method_offset; - ExecuteCommand(offset, data); - break; - } - default: - UNIMPLEMENTED_MSG("ChSubmission mode {} is not implemented!", static_cast(mode)); - break; + + size_t i = 0; + for (const auto value : command_list) { + i++; + if (mask != 0) { + const auto lbs = static_cast(std::countr_zero(mask)); + mask &= ~(1U << lbs); + ExecuteCommand(method_offset + lbs, value.raw); + continue; + } else if (count != 0) { + --count; + ExecuteCommand(method_offset, value.raw); + if (incrementing) { + ++method_offset; + } + continue; + } + const auto mode = value.submission_mode.Value(); + switch (mode) { + case ChSubmissionMode::SetClass: { + mask = value.value & 0x3f; + method_offset = value.method_offset; + current_class = static_cast((value.value >> 6) & 0x3ff); + break; + } + case ChSubmissionMode::Incrementing: + case ChSubmissionMode::NonIncrementing: + count = value.value; + method_offset = value.method_offset; + incrementing = mode == ChSubmissionMode::Incrementing; + break; + case ChSubmissionMode::Mask: + mask = value.value; + method_offset = value.method_offset; + break; + case ChSubmissionMode::Immediate: { + const u32 data = value.value & 0xfff; + method_offset = value.method_offset; + ExecuteCommand(method_offset, data); + break; + } + default: + LOG_ERROR(HW_GPU, "Bad command at index {} (bytes 0x{:X}), buffer size {}", i - 1, + (i - 1) * sizeof(u32), command_list.size()); + UNIMPLEMENTED_MSG("ChSubmission mode {} is not implemented!", + static_cast(mode)); + break; + } } } } -void CDmaPusher::ExecuteCommand(u32 state_offset, u32 data) { +void CDmaPusher::ExecuteCommand(u32 method, u32 arg) { switch (current_class) { - case ChClassId::NvDec: - ThiStateWrite(nvdec_thi_state, offset, data); - switch (static_cast(offset)) { - case ThiMethod::IncSyncpt: { - LOG_DEBUG(Service_NVDRV, "NVDEC Class IncSyncpt Method"); - const auto syncpoint_id = static_cast(data & 0xFF); - const auto cond = static_cast((data >> 8) & 0xFF); - if (cond == 0) { - sync_manager->Increment(syncpoint_id); - } else { - sync_manager->SignalDone( - sync_manager->IncrementWhenDone(static_cast(current_class), syncpoint_id)); - } - break; - } - case ThiMethod::SetMethod1: - LOG_DEBUG(Service_NVDRV, "NVDEC method 0x{:X}", - static_cast(nvdec_thi_state.method_0)); - nvdec_processor->ProcessMethod(nvdec_thi_state.method_0, data); - break; - default: - break; - } - break; - case ChClassId::GraphicsVic: - ThiStateWrite(vic_thi_state, static_cast(state_offset), {data}); - switch (static_cast(state_offset)) { - case ThiMethod::IncSyncpt: { - LOG_DEBUG(Service_NVDRV, "VIC Class IncSyncpt Method"); - const auto syncpoint_id = static_cast(data & 0xFF); - const auto cond = static_cast((data >> 8) & 0xFF); - if (cond == 0) { - sync_manager->Increment(syncpoint_id); - } else { - sync_manager->SignalDone( - sync_manager->IncrementWhenDone(static_cast(current_class), syncpoint_id)); - } - break; - } - case ThiMethod::SetMethod1: - LOG_DEBUG(Service_NVDRV, "VIC method 0x{:X}, Args=({})", - static_cast(vic_thi_state.method_0), data); - vic_processor->ProcessMethod(static_cast(vic_thi_state.method_0), - data); - break; - default: - break; - } - break; case ChClassId::Control: - // This device is mainly for syncpoint synchronization - LOG_DEBUG(Service_NVDRV, "Host1X Class Method"); - host1x_processor->ProcessMethod(static_cast(offset), data); + LOG_TRACE(Service_NVDRV, "Class {} method 0x{:X} arg 0x{:X}", + static_cast(current_class), method, arg); + host_processor->ProcessMethod(static_cast(method), arg); break; default: - UNIMPLEMENTED_MSG("Current class not implemented {:X}", static_cast(current_class)); - break; + thi_regs.reg_array[method] = arg; + switch (static_cast(method)) { + case ThiMethod::IncSyncpt: { + const auto syncpoint_id = static_cast(arg & 0xFF); + [[maybe_unused]] const auto cond = static_cast((arg >> 8) & 0xFF); + LOG_TRACE(Service_NVDRV, "Class {} IncSyncpt Method, syncpt {} cond {}", + static_cast(current_class), syncpoint_id, cond); + auto& syncpoint_manager = host1x.GetSyncpointManager(); + syncpoint_manager.IncrementGuest(syncpoint_id); + syncpoint_manager.IncrementHost(syncpoint_id); + break; + } + case ThiMethod::SetMethod1: + LOG_TRACE(Service_NVDRV, "Class {} method 0x{:X} arg 0x{:X}", + static_cast(current_class), static_cast(thi_regs.method_0), arg); + ProcessMethod(thi_regs.method_0, arg); + break; + default: + break; + } } } -void CDmaPusher::ThiStateWrite(ThiRegisters& state, u32 state_offset, u32 argument) { - u8* const offset_ptr = reinterpret_cast(&state) + sizeof(u32) * state_offset; - std::memcpy(offset_ptr, &argument, sizeof(u32)); -} - } // namespace Tegra diff --git a/src/video_core/cdma_pusher.h b/src/video_core/cdma_pusher.h index 7d660af47..becbccef1 100644 --- a/src/video_core/cdma_pusher.h +++ b/src/video_core/cdma_pusher.h @@ -3,12 +3,18 @@ #pragma once +#include +#include #include +#include +#include #include #include "common/bit_field.h" #include "common/common_funcs.h" #include "common/common_types.h" +#include "common/polyfill_thread.h" +#include "core/memory.h" namespace Tegra { @@ -62,23 +68,31 @@ struct ChCommand { std::vector arguments; }; -using ChCommandHeaderList = std::vector; +using ChCommandHeaderList = + Core::Memory::CpuGuestMemory; struct ThiRegisters { - u32_le increment_syncpt{}; - INSERT_PADDING_WORDS(1); - u32_le increment_syncpt_error{}; - u32_le ctx_switch_incremement_syncpt{}; - INSERT_PADDING_WORDS(4); - u32_le ctx_switch{}; - INSERT_PADDING_WORDS(1); - u32_le ctx_syncpt_eof{}; - INSERT_PADDING_WORDS(5); - u32_le method_0{}; - u32_le method_1{}; - INSERT_PADDING_WORDS(12); - u32_le int_status{}; - u32_le int_mask{}; + static constexpr std::size_t NUM_REGS = 0x20; + + union { + struct { + u32_le increment_syncpt; + INSERT_PADDING_WORDS_NOINIT(1); + u32_le increment_syncpt_error; + u32_le ctx_switch_incremement_syncpt; + INSERT_PADDING_WORDS_NOINIT(4); + u32_le ctx_switch; + INSERT_PADDING_WORDS_NOINIT(1); + u32_le ctx_syncpt_eof; + INSERT_PADDING_WORDS_NOINIT(5); + u32_le method_0; + u32_le method_1; + INSERT_PADDING_WORDS_NOINIT(12); + u32_le int_status; + u32_le int_mask; + }; + std::array reg_array; + }; }; enum class ThiMethod : u32 { @@ -89,32 +103,39 @@ enum class ThiMethod : u32 { class CDmaPusher { public: - explicit CDmaPusher(Host1x::Host1x& host1x); - ~CDmaPusher(); + CDmaPusher() = delete; + virtual ~CDmaPusher(); - /// Process the command entry - void ProcessEntries(ChCommandHeaderList&& entries); + void PushEntries(ChCommandHeaderList&& entries) { + std::scoped_lock l{command_mutex}; + command_lists.push_back(std::move(entries)); + command_cv.notify_one(); + } + +protected: + explicit CDmaPusher(Host1x::Host1x& host1x, s32 id); + + virtual void ProcessMethod(u32 method, u32 arg) = 0; + + Host1x::Host1x& host1x; + Tegra::MemoryManager& memory_manager; private: + /// Process the command entry + void ProcessEntries(std::stop_token stop_token); + /// Invoke command class devices to execute the command based on the current state void ExecuteCommand(u32 state_offset, u32 data); - /// Write arguments value to the ThiRegisters member at the specified offset - void ThiStateWrite(ThiRegisters& state, u32 offset, u32 argument); + std::unique_ptr host_processor; - Host1x::Host1x& host1x; - std::shared_ptr nvdec_processor; - std::unique_ptr vic_processor; - std::unique_ptr host1x_processor; - std::unique_ptr sync_manager; - ChClassId current_class{}; - ThiRegisters vic_thi_state{}; - ThiRegisters nvdec_thi_state{}; + std::mutex command_mutex; + std::condition_variable_any command_cv; + std::deque command_lists; + std::jthread thread; - u32 count{}; - u32 offset{}; - u32 mask{}; - bool incrementing{}; + ThiRegisters thi_regs{}; + ChClassId current_class; }; } // namespace Tegra diff --git a/src/video_core/gpu.cpp b/src/video_core/gpu.cpp index 6d0b32339..c816f47fe 100644 --- a/src/video_core/gpu.cpp +++ b/src/video_core/gpu.cpp @@ -250,30 +250,6 @@ struct GPU::Impl { gpu_thread.SubmitList(channel, std::move(entries)); } - /// Push GPU command buffer entries to be processed - void PushCommandBuffer(u32 id, Tegra::ChCommandHeaderList& entries) { - if (!use_nvdec) { - return; - } - - if (!cdma_pushers.contains(id)) { - cdma_pushers.insert_or_assign(id, std::make_unique(host1x)); - } - - // SubmitCommandBuffer would make the nvdec operations async, this is not currently working - // TODO(ameerj): RE proper async nvdec operation - // gpu_thread.SubmitCommandBuffer(std::move(entries)); - cdma_pushers[id]->ProcessEntries(std::move(entries)); - } - - /// Frees the CDMAPusher instance to free up resources - void ClearCdmaInstance(u32 id) { - const auto iter = cdma_pushers.find(id); - if (iter != cdma_pushers.end()) { - cdma_pushers.erase(iter); - } - } - /// Notify rasterizer that any caches of the specified region should be flushed to Switch memory void FlushRegion(DAddr addr, u64 size) { gpu_thread.FlushRegion(addr, size); @@ -362,7 +338,6 @@ struct GPU::Impl { Core::System& system; Host1x::Host1x& host1x; - std::map> cdma_pushers; std::unique_ptr renderer; VideoCore::RasterizerInterface* rasterizer = nullptr; const bool use_nvdec; @@ -556,14 +531,6 @@ void GPU::PushGPUEntries(s32 channel, Tegra::CommandList&& entries) { impl->PushGPUEntries(channel, std::move(entries)); } -void GPU::PushCommandBuffer(u32 id, Tegra::ChCommandHeaderList& entries) { - impl->PushCommandBuffer(id, entries); -} - -void GPU::ClearCdmaInstance(u32 id) { - impl->ClearCdmaInstance(id); -} - VideoCore::RasterizerDownloadArea GPU::OnCPURead(PAddr addr, u64 size) { return impl->OnCPURead(addr, size); } diff --git a/src/video_core/gpu.h b/src/video_core/gpu.h index 50014e51f..8a06adad7 100644 --- a/src/video_core/gpu.h +++ b/src/video_core/gpu.h @@ -234,15 +234,6 @@ public: /// Push GPU command entries to be processed void PushGPUEntries(s32 channel, Tegra::CommandList&& entries); - /// Push GPU command buffer entries to be processed - void PushCommandBuffer(u32 id, Tegra::ChCommandHeaderList& entries); - - /// Frees the CDMAPusher instance to free up resources - void ClearCdmaInstance(u32 id); - - /// Swap buffers (render frame) - void SwapBuffers(const Tegra::FramebufferConfig* framebuffer); - /// Notify rasterizer that any caches of the specified region should be flushed to Switch memory [[nodiscard]] VideoCore::RasterizerDownloadArea OnCPURead(DAddr addr, u64 size); diff --git a/src/video_core/gpu_thread.cpp b/src/video_core/gpu_thread.cpp index 477e11457..e2bfdcd7f 100644 --- a/src/video_core/gpu_thread.cpp +++ b/src/video_core/gpu_thread.cpp @@ -12,6 +12,7 @@ #include "video_core/dma_pusher.h" #include "video_core/gpu.h" #include "video_core/gpu_thread.h" +#include "video_core/host1x/host1x.h" #include "video_core/renderer_base.h" namespace VideoCommon::GPUThread { diff --git a/src/video_core/host1x/codecs/decoder.cpp b/src/video_core/host1x/codecs/decoder.cpp new file mode 100755 index 000000000..49a601969 --- /dev/null +++ b/src/video_core/host1x/codecs/decoder.cpp @@ -0,0 +1,71 @@ +// SPDX-FileCopyrightText: Copyright 2023 yuzu Emulator Project +// SPDX-License-Identifier: GPL-2.0-or-later + +#include "common/assert.h" +#include "common/settings.h" +#include "video_core/host1x/codecs/decoder.h" +#include "video_core/host1x/host1x.h" +#include "video_core/memory_manager.h" + +namespace Tegra { + +Decoder::Decoder(Host1x::Host1x& host1x_, s32 id_, const Host1x::NvdecCommon::NvdecRegisters& regs_, + Host1x::FrameQueue& frame_queue_) + : host1x(host1x_), memory_manager{host1x.GMMU()}, regs{regs_}, id{id_}, frame_queue{ + frame_queue_} {} + +Decoder::~Decoder() = default; + +void Decoder::Decode() { + if (!initialized) { + return; + } + + const auto packet_data = ComposeFrame(); + // Send assembled bitstream to decoder. + if (!decode_api.SendPacket(packet_data)) { + return; + } + + // Only receive/store visible frames. + if (vp9_hidden_frame) { + return; + } + + // Receive output frames from decoder. + auto frame = decode_api.ReceiveFrame(); + + if (IsInterlaced()) { + auto [luma_top, luma_bottom, chroma_top, chroma_bottom] = GetInterlacedOffsets(); + auto frame_copy = frame; + + if (!frame.get()) { + LOG_ERROR(HW_GPU, + "Nvdec {} dailed to decode interlaced frame for top 0x{:X} bottom 0x{:X}", id, + luma_top, luma_bottom); + } + + if (UsingDecodeOrder()) { + frame_queue.PushDecodeOrder(id, luma_top, std::move(frame)); + frame_queue.PushDecodeOrder(id, luma_bottom, std::move(frame_copy)); + } else { + frame_queue.PushPresentOrder(id, luma_top, std::move(frame)); + frame_queue.PushPresentOrder(id, luma_bottom, std::move(frame_copy)); + } + } else { + auto [luma_offset, chroma_offset] = GetProgressiveOffsets(); + + if (!frame.get()) { + LOG_ERROR(HW_GPU, "Nvdec {} failed to decode progressive frame for luma 0x{:X}", id, + luma_offset); + } + + if (UsingDecodeOrder()) { + frame_queue.PushDecodeOrder(id, luma_offset, std::move(frame)); + } else { + frame_queue.PushPresentOrder(id, luma_offset, std::move(frame)); + } + } +} + +} // namespace Tegra diff --git a/src/video_core/host1x/codecs/decoder.h b/src/video_core/host1x/codecs/decoder.h new file mode 100755 index 000000000..22e6db815 --- /dev/null +++ b/src/video_core/host1x/codecs/decoder.h @@ -0,0 +1,64 @@ +// SPDX-FileCopyrightText: Copyright 2023 yuzu Emulator Project +// SPDX-License-Identifier: GPL-2.0-or-later + +#pragma once + +#include +#include +#include +#include +#include +#include + +#include "common/common_types.h" +#include "video_core/host1x/ffmpeg/ffmpeg.h" +#include "video_core/host1x/nvdec_common.h" + +namespace Tegra { + +namespace Host1x { +class Host1x; +class FrameQueue; +} // namespace Host1x + +class Decoder { +public: + virtual ~Decoder(); + + /// Call decoders to construct headers, decode AVFrame with ffmpeg + void Decode(); + + bool UsingDecodeOrder() const { + return decode_api.UsingDecodeOrder(); + } + + /// Returns the value of current_codec + [[nodiscard]] Host1x::NvdecCommon::VideoCodec GetCurrentCodec() const { + return codec; + } + + /// Return name of the current codec + [[nodiscard]] virtual std::string_view GetCurrentCodecName() const = 0; + +protected: + explicit Decoder(Host1x::Host1x& host1x, s32 id, + const Host1x::NvdecCommon::NvdecRegisters& regs, + Host1x::FrameQueue& frame_queue); + + virtual std::span ComposeFrame() = 0; + virtual std::tuple GetProgressiveOffsets() = 0; + virtual std::tuple GetInterlacedOffsets() = 0; + virtual bool IsInterlaced() = 0; + + Host1x::Host1x& host1x; + Tegra::MemoryManager& memory_manager; + const Host1x::NvdecCommon::NvdecRegisters& regs; + s32 id; + Host1x::FrameQueue& frame_queue; + Host1x::NvdecCommon::VideoCodec codec; + FFmpeg::DecodeApi decode_api; + bool initialized{}; + bool vp9_hidden_frame{}; +}; + +} // namespace Tegra diff --git a/src/video_core/host1x/codecs/h264.cpp b/src/video_core/host1x/codecs/h264.cpp index 994591c8d..782d11d72 100644 --- a/src/video_core/host1x/codecs/h264.cpp +++ b/src/video_core/host1x/codecs/h264.cpp @@ -1,5 +1,5 @@ -// SPDX-FileCopyrightText: Ryujinx Team and Contributors -// SPDX-License-Identifier: MIT +// SPDX-FileCopyrightText: Copyright 2023 yuzu Emulator Project +// SPDX-License-Identifier: GPL-2.0-or-later #include #include @@ -10,7 +10,7 @@ #include "video_core/host1x/host1x.h" #include "video_core/memory_manager.h" -namespace Tegra::Decoder { +namespace Tegra::Decoders { namespace { // ZigZag LUTs from libavcodec. constexpr std::array zig_zag_direct{ @@ -25,23 +25,56 @@ constexpr std::array zig_zag_scan{ }; } // Anonymous namespace -H264::H264(Host1x::Host1x& host1x_) : host1x{host1x_} {} +H264::H264(Host1x::Host1x& host1x_, const Host1x::NvdecCommon::NvdecRegisters& regs_, s32 id_, + Host1x::FrameQueue& frame_queue_) + : Decoder{host1x_, id_, regs_, frame_queue_} { + codec = Host1x::NvdecCommon::VideoCodec::H264; + initialized = decode_api.Initialize(codec); +} H264::~H264() = default; -std::span H264::ComposeFrame(const Host1x::NvdecCommon::NvdecRegisters& state, - size_t* out_configuration_size, bool is_first_frame) { - H264DecoderContext context; - host1x.GMMU().ReadBlock(state.picture_info_offset, &context, sizeof(H264DecoderContext)); +std::tuple H264::GetProgressiveOffsets() { + auto pic_idx{current_context.h264_parameter_set.curr_pic_idx}; + auto luma{regs.surface_luma_offsets[pic_idx].Address() + + current_context.h264_parameter_set.luma_frame_offset.Address()}; + auto chroma{regs.surface_chroma_offsets[pic_idx].Address() + + current_context.h264_parameter_set.chroma_frame_offset.Address()}; + return {luma, chroma}; +} - const s64 frame_number = context.h264_parameter_set.frame_number.Value(); +std::tuple H264::GetInterlacedOffsets() { + auto pic_idx{current_context.h264_parameter_set.curr_pic_idx}; + auto luma_top{regs.surface_luma_offsets[pic_idx].Address() + + current_context.h264_parameter_set.luma_top_offset.Address()}; + auto luma_bottom{regs.surface_luma_offsets[pic_idx].Address() + + current_context.h264_parameter_set.luma_bot_offset.Address()}; + auto chroma_top{regs.surface_chroma_offsets[pic_idx].Address() + + current_context.h264_parameter_set.chroma_top_offset.Address()}; + auto chroma_bottom{regs.surface_chroma_offsets[pic_idx].Address() + + current_context.h264_parameter_set.chroma_bot_offset.Address()}; + return {luma_top, luma_bottom, chroma_top, chroma_bottom}; +} + +bool H264::IsInterlaced() { + return current_context.h264_parameter_set.luma_top_offset.Address() != 0 || + current_context.h264_parameter_set.luma_bot_offset.Address() != 0; +} + +std::span H264::ComposeFrame() { + memory_manager.ReadBlock(regs.picture_info_offset.Address(), ¤t_context, + sizeof(H264DecoderContext)); + + const s64 frame_number = current_context.h264_parameter_set.frame_number.Value(); if (!is_first_frame && frame_number != 0) { - frame.resize_destructive(context.stream_len); - host1x.GMMU().ReadBlock(state.frame_bitstream_offset, frame.data(), frame.size()); - *out_configuration_size = 0; - return frame; + frame_scratch.resize_destructive(current_context.stream_len); + memory_manager.ReadBlock(regs.frame_bitstream_offset.Address(), frame_scratch.data(), + frame_scratch.size()); + return frame_scratch; } + is_first_frame = false; + // Encode header H264BitWriter writer{}; writer.WriteU(1, 24); @@ -53,7 +86,7 @@ std::span H264::ComposeFrame(const Host1x::NvdecCommon::NvdecRegisters writer.WriteU(31, 8); writer.WriteUe(0); const u32 chroma_format_idc = - static_cast(context.h264_parameter_set.chroma_format_idc.Value()); + static_cast(current_context.h264_parameter_set.chroma_format_idc.Value()); writer.WriteUe(chroma_format_idc); if (chroma_format_idc == 3) { writer.WriteBit(false); @@ -61,42 +94,44 @@ std::span H264::ComposeFrame(const Host1x::NvdecCommon::NvdecRegisters writer.WriteUe(0); writer.WriteUe(0); - writer.WriteBit(false); // QpprimeYZeroTransformBypassFlag + writer.WriteBit(current_context.qpprime_y_zero_transform_bypass_flag.Value() != 0); writer.WriteBit(false); // Scaling matrix present flag - writer.WriteUe(static_cast(context.h264_parameter_set.log2_max_frame_num_minus4.Value())); + writer.WriteUe( + static_cast(current_context.h264_parameter_set.log2_max_frame_num_minus4.Value())); const auto order_cnt_type = - static_cast(context.h264_parameter_set.pic_order_cnt_type.Value()); + static_cast(current_context.h264_parameter_set.pic_order_cnt_type.Value()); writer.WriteUe(order_cnt_type); if (order_cnt_type == 0) { - writer.WriteUe(context.h264_parameter_set.log2_max_pic_order_cnt_lsb_minus4); + writer.WriteUe(current_context.h264_parameter_set.log2_max_pic_order_cnt_lsb_minus4); } else if (order_cnt_type == 1) { - writer.WriteBit(context.h264_parameter_set.delta_pic_order_always_zero_flag != 0); + writer.WriteBit(current_context.h264_parameter_set.delta_pic_order_always_zero_flag != 0); writer.WriteSe(0); writer.WriteSe(0); writer.WriteUe(0); } - const s32 pic_height = context.h264_parameter_set.frame_height_in_map_units / - (context.h264_parameter_set.frame_mbs_only_flag ? 1 : 2); + const s32 pic_height = current_context.h264_parameter_set.frame_height_in_mbs / + (current_context.h264_parameter_set.frame_mbs_only_flag ? 1 : 2); - // TODO (ameerj): Where do we get this number, it seems to be particular for each stream - const auto nvdec_decoding = Settings::values.nvdec_emulation.GetValue(); - const bool uses_gpu_decoding = nvdec_decoding == Settings::NvdecEmulation::Gpu; - const u32 max_num_ref_frames = uses_gpu_decoding ? 6u : 16u; + u32 max_num_ref_frames = + std::max(std::max(current_context.h264_parameter_set.num_refidx_l0_default_active, + current_context.h264_parameter_set.num_refidx_l1_default_active) + + 1, + 4); writer.WriteUe(max_num_ref_frames); writer.WriteBit(false); - writer.WriteUe(context.h264_parameter_set.pic_width_in_mbs - 1); + writer.WriteUe(current_context.h264_parameter_set.pic_width_in_mbs - 1); writer.WriteUe(pic_height - 1); - writer.WriteBit(context.h264_parameter_set.frame_mbs_only_flag != 0); + writer.WriteBit(current_context.h264_parameter_set.frame_mbs_only_flag != 0); - if (!context.h264_parameter_set.frame_mbs_only_flag) { - writer.WriteBit(context.h264_parameter_set.flags.mbaff_frame.Value() != 0); + if (!current_context.h264_parameter_set.frame_mbs_only_flag) { + writer.WriteBit(current_context.h264_parameter_set.flags.mbaff_frame.Value() != 0); } - writer.WriteBit(context.h264_parameter_set.flags.direct_8x8_inference.Value() != 0); + writer.WriteBit(current_context.h264_parameter_set.flags.direct_8x8_inference.Value() != 0); writer.WriteBit(false); // Frame cropping flag writer.WriteBit(false); // VUI parameter present flag @@ -111,57 +146,59 @@ std::span H264::ComposeFrame(const Host1x::NvdecCommon::NvdecRegisters writer.WriteUe(0); writer.WriteUe(0); - writer.WriteBit(context.h264_parameter_set.entropy_coding_mode_flag != 0); - writer.WriteBit(context.h264_parameter_set.pic_order_present_flag != 0); + writer.WriteBit(current_context.h264_parameter_set.entropy_coding_mode_flag != 0); + writer.WriteBit(current_context.h264_parameter_set.pic_order_present_flag != 0); writer.WriteUe(0); - writer.WriteUe(context.h264_parameter_set.num_refidx_l0_default_active); - writer.WriteUe(context.h264_parameter_set.num_refidx_l1_default_active); - writer.WriteBit(context.h264_parameter_set.flags.weighted_pred.Value() != 0); - writer.WriteU(static_cast(context.h264_parameter_set.weighted_bipred_idc.Value()), 2); - s32 pic_init_qp = static_cast(context.h264_parameter_set.pic_init_qp_minus26.Value()); + writer.WriteUe(current_context.h264_parameter_set.num_refidx_l0_default_active); + writer.WriteUe(current_context.h264_parameter_set.num_refidx_l1_default_active); + writer.WriteBit(current_context.h264_parameter_set.flags.weighted_pred.Value() != 0); + writer.WriteU(static_cast(current_context.h264_parameter_set.weighted_bipred_idc.Value()), + 2); + s32 pic_init_qp = + static_cast(current_context.h264_parameter_set.pic_init_qp_minus26.Value()); writer.WriteSe(pic_init_qp); writer.WriteSe(0); s32 chroma_qp_index_offset = - static_cast(context.h264_parameter_set.chroma_qp_index_offset.Value()); + static_cast(current_context.h264_parameter_set.chroma_qp_index_offset.Value()); writer.WriteSe(chroma_qp_index_offset); - writer.WriteBit(context.h264_parameter_set.deblocking_filter_control_present_flag != 0); - writer.WriteBit(context.h264_parameter_set.flags.constrained_intra_pred.Value() != 0); - writer.WriteBit(context.h264_parameter_set.redundant_pic_cnt_present_flag != 0); - writer.WriteBit(context.h264_parameter_set.transform_8x8_mode_flag != 0); + writer.WriteBit(current_context.h264_parameter_set.deblocking_filter_control_present_flag != 0); + writer.WriteBit(current_context.h264_parameter_set.flags.constrained_intra_pred.Value() != 0); + writer.WriteBit(current_context.h264_parameter_set.redundant_pic_cnt_present_flag != 0); + writer.WriteBit(current_context.h264_parameter_set.transform_8x8_mode_flag != 0); writer.WriteBit(true); // pic_scaling_matrix_present_flag for (s32 index = 0; index < 6; index++) { writer.WriteBit(true); - std::span matrix{context.weight_scale}; - writer.WriteScalingList(scan, matrix, index * 16, 16); + std::span matrix{current_context.weight_scale_4x4}; + writer.WriteScalingList(scan_scratch, matrix, index * 16, 16); } - if (context.h264_parameter_set.transform_8x8_mode_flag) { + if (current_context.h264_parameter_set.transform_8x8_mode_flag) { for (s32 index = 0; index < 2; index++) { writer.WriteBit(true); - std::span matrix{context.weight_scale_8x8}; - writer.WriteScalingList(scan, matrix, index * 64, 64); + std::span matrix{current_context.weight_scale_8x8}; + writer.WriteScalingList(scan_scratch, matrix, index * 64, 64); } } s32 chroma_qp_index_offset2 = - static_cast(context.h264_parameter_set.second_chroma_qp_index_offset.Value()); + static_cast(current_context.h264_parameter_set.second_chroma_qp_index_offset.Value()); writer.WriteSe(chroma_qp_index_offset2); writer.End(); const auto& encoded_header = writer.GetByteArray(); - frame.resize(encoded_header.size() + context.stream_len); - std::memcpy(frame.data(), encoded_header.data(), encoded_header.size()); + frame_scratch.resize(encoded_header.size() + current_context.stream_len); + std::memcpy(frame_scratch.data(), encoded_header.data(), encoded_header.size()); - *out_configuration_size = encoded_header.size(); - host1x.GMMU().ReadBlock(state.frame_bitstream_offset, frame.data() + encoded_header.size(), - context.stream_len); + memory_manager.ReadBlock(regs.frame_bitstream_offset.Address(), + frame_scratch.data() + encoded_header.size(), + current_context.stream_len); - return frame; + return frame_scratch; } H264BitWriter::H264BitWriter() = default; @@ -278,4 +315,4 @@ void H264BitWriter::Flush() { buffer = 0; buffer_pos = 0; } -} // namespace Tegra::Decoder +} // namespace Tegra::Decoders diff --git a/src/video_core/host1x/codecs/h264.h b/src/video_core/host1x/codecs/h264.h index 1deaf4632..d946c6937 100644 --- a/src/video_core/host1x/codecs/h264.h +++ b/src/video_core/host1x/codecs/h264.h @@ -1,5 +1,5 @@ -// SPDX-FileCopyrightText: Ryujinx Team and Contributors -// SPDX-License-Identifier: MIT +// SPDX-FileCopyrightText: Copyright 2023 yuzu Emulator Project +// SPDX-License-Identifier: GPL-2.0-or-later #pragma once @@ -10,6 +10,7 @@ #include "common/common_funcs.h" #include "common/common_types.h" #include "common/scratch_buffer.h" +#include "video_core/host1x/codecs/decoder.h" #include "video_core/host1x/nvdec_common.h" namespace Tegra { @@ -18,7 +19,7 @@ namespace Host1x { class Host1x; } // namespace Host1x -namespace Decoder { +namespace Decoders { class H264BitWriter { public: @@ -60,123 +61,213 @@ private: std::vector byte_array; }; -class H264 { -public: - explicit H264(Host1x::Host1x& host1x); - ~H264(); - - /// Compose the H264 frame for FFmpeg decoding - [[nodiscard]] std::span ComposeFrame(const Host1x::NvdecCommon::NvdecRegisters& state, - size_t* out_configuration_size, - bool is_first_frame = false); +struct Offset { + constexpr u32 Address() const noexcept { + return offset << 8; + } private: - Common::ScratchBuffer frame; - Common::ScratchBuffer scan; - Host1x::Host1x& host1x; + u32 offset; +}; +static_assert(std::is_trivial_v, "Offset must be trivial"); +static_assert(sizeof(Offset) == 0x4, "Offset has the wrong size!"); - struct H264ParameterSet { - s32 log2_max_pic_order_cnt_lsb_minus4; ///< 0x00 - s32 delta_pic_order_always_zero_flag; ///< 0x04 - s32 frame_mbs_only_flag; ///< 0x08 - u32 pic_width_in_mbs; ///< 0x0C - u32 frame_height_in_map_units; ///< 0x10 - union { ///< 0x14 - BitField<0, 2, u32> tile_format; - BitField<2, 3, u32> gob_height; - }; - u32 entropy_coding_mode_flag; ///< 0x18 - s32 pic_order_present_flag; ///< 0x1C - s32 num_refidx_l0_default_active; ///< 0x20 - s32 num_refidx_l1_default_active; ///< 0x24 - s32 deblocking_filter_control_present_flag; ///< 0x28 - s32 redundant_pic_cnt_present_flag; ///< 0x2C - u32 transform_8x8_mode_flag; ///< 0x30 - u32 pitch_luma; ///< 0x34 - u32 pitch_chroma; ///< 0x38 - u32 luma_top_offset; ///< 0x3C - u32 luma_bot_offset; ///< 0x40 - u32 luma_frame_offset; ///< 0x44 - u32 chroma_top_offset; ///< 0x48 - u32 chroma_bot_offset; ///< 0x4C - u32 chroma_frame_offset; ///< 0x50 - u32 hist_buffer_size; ///< 0x54 - union { ///< 0x58 - union { - BitField<0, 1, u64> mbaff_frame; - BitField<1, 1, u64> direct_8x8_inference; - BitField<2, 1, u64> weighted_pred; - BitField<3, 1, u64> constrained_intra_pred; - BitField<4, 1, u64> ref_pic; - BitField<5, 1, u64> field_pic; - BitField<6, 1, u64> bottom_field; - BitField<7, 1, u64> second_field; - } flags; - BitField<8, 4, u64> log2_max_frame_num_minus4; - BitField<12, 2, u64> chroma_format_idc; - BitField<14, 2, u64> pic_order_cnt_type; - BitField<16, 6, s64> pic_init_qp_minus26; - BitField<22, 5, s64> chroma_qp_index_offset; - BitField<27, 5, s64> second_chroma_qp_index_offset; - BitField<32, 2, u64> weighted_bipred_idc; - BitField<34, 7, u64> curr_pic_idx; - BitField<41, 5, u64> curr_col_idx; - BitField<46, 16, u64> frame_number; - BitField<62, 1, u64> frame_surfaces; - BitField<63, 1, u64> output_memory_layout; - }; +struct H264ParameterSet { + s32 log2_max_pic_order_cnt_lsb_minus4; ///< 0x00 + s32 delta_pic_order_always_zero_flag; ///< 0x04 + s32 frame_mbs_only_flag; ///< 0x08 + u32 pic_width_in_mbs; ///< 0x0C + u32 frame_height_in_mbs; ///< 0x10 + union { ///< 0x14 + BitField<0, 2, u32> tile_format; + BitField<2, 3, u32> gob_height; + BitField<5, 27, u32> reserved_surface_format; }; - static_assert(sizeof(H264ParameterSet) == 0x60, "H264ParameterSet is an invalid size"); - - struct H264DecoderContext { - INSERT_PADDING_WORDS_NOINIT(18); ///< 0x0000 - u32 stream_len; ///< 0x0048 - INSERT_PADDING_WORDS_NOINIT(3); ///< 0x004C - H264ParameterSet h264_parameter_set; ///< 0x0058 - INSERT_PADDING_WORDS_NOINIT(66); ///< 0x00B8 - std::array weight_scale; ///< 0x01C0 - std::array weight_scale_8x8; ///< 0x0220 + u32 entropy_coding_mode_flag; ///< 0x18 + s32 pic_order_present_flag; ///< 0x1C + s32 num_refidx_l0_default_active; ///< 0x20 + s32 num_refidx_l1_default_active; ///< 0x24 + s32 deblocking_filter_control_present_flag; ///< 0x28 + s32 redundant_pic_cnt_present_flag; ///< 0x2C + u32 transform_8x8_mode_flag; ///< 0x30 + u32 pitch_luma; ///< 0x34 + u32 pitch_chroma; ///< 0x38 + Offset luma_top_offset; ///< 0x3C + Offset luma_bot_offset; ///< 0x40 + Offset luma_frame_offset; ///< 0x44 + Offset chroma_top_offset; ///< 0x48 + Offset chroma_bot_offset; ///< 0x4C + Offset chroma_frame_offset; ///< 0x50 + u32 hist_buffer_size; ///< 0x54 + union { ///< 0x58 + union { + BitField<0, 1, u64> mbaff_frame; + BitField<1, 1, u64> direct_8x8_inference; + BitField<2, 1, u64> weighted_pred; + BitField<3, 1, u64> constrained_intra_pred; + BitField<4, 1, u64> ref_pic; + BitField<5, 1, u64> field_pic; + BitField<6, 1, u64> bottom_field; + BitField<7, 1, u64> second_field; + } flags; + BitField<8, 4, u64> log2_max_frame_num_minus4; + BitField<12, 2, u64> chroma_format_idc; + BitField<14, 2, u64> pic_order_cnt_type; + BitField<16, 6, s64> pic_init_qp_minus26; + BitField<22, 5, s64> chroma_qp_index_offset; + BitField<27, 5, s64> second_chroma_qp_index_offset; + BitField<32, 2, u64> weighted_bipred_idc; + BitField<34, 7, u64> curr_pic_idx; + BitField<41, 5, u64> curr_col_idx; + BitField<46, 16, u64> frame_number; + BitField<62, 1, u64> frame_surfaces; + BitField<63, 1, u64> output_memory_layout; }; - static_assert(sizeof(H264DecoderContext) == 0x2A0, "H264DecoderContext is an invalid size"); +}; +static_assert(sizeof(H264ParameterSet) == 0x60, "H264ParameterSet is an invalid size"); #define ASSERT_POSITION(field_name, position) \ static_assert(offsetof(H264ParameterSet, field_name) == position, \ "Field " #field_name " has invalid position") - ASSERT_POSITION(log2_max_pic_order_cnt_lsb_minus4, 0x00); - ASSERT_POSITION(delta_pic_order_always_zero_flag, 0x04); - ASSERT_POSITION(frame_mbs_only_flag, 0x08); - ASSERT_POSITION(pic_width_in_mbs, 0x0C); - ASSERT_POSITION(frame_height_in_map_units, 0x10); - ASSERT_POSITION(tile_format, 0x14); - ASSERT_POSITION(entropy_coding_mode_flag, 0x18); - ASSERT_POSITION(pic_order_present_flag, 0x1C); - ASSERT_POSITION(num_refidx_l0_default_active, 0x20); - ASSERT_POSITION(num_refidx_l1_default_active, 0x24); - ASSERT_POSITION(deblocking_filter_control_present_flag, 0x28); - ASSERT_POSITION(redundant_pic_cnt_present_flag, 0x2C); - ASSERT_POSITION(transform_8x8_mode_flag, 0x30); - ASSERT_POSITION(pitch_luma, 0x34); - ASSERT_POSITION(pitch_chroma, 0x38); - ASSERT_POSITION(luma_top_offset, 0x3C); - ASSERT_POSITION(luma_bot_offset, 0x40); - ASSERT_POSITION(luma_frame_offset, 0x44); - ASSERT_POSITION(chroma_top_offset, 0x48); - ASSERT_POSITION(chroma_bot_offset, 0x4C); - ASSERT_POSITION(chroma_frame_offset, 0x50); - ASSERT_POSITION(hist_buffer_size, 0x54); - ASSERT_POSITION(flags, 0x58); +ASSERT_POSITION(log2_max_pic_order_cnt_lsb_minus4, 0x00); +ASSERT_POSITION(delta_pic_order_always_zero_flag, 0x04); +ASSERT_POSITION(frame_mbs_only_flag, 0x08); +ASSERT_POSITION(pic_width_in_mbs, 0x0C); +ASSERT_POSITION(frame_height_in_mbs, 0x10); +ASSERT_POSITION(tile_format, 0x14); +ASSERT_POSITION(entropy_coding_mode_flag, 0x18); +ASSERT_POSITION(pic_order_present_flag, 0x1C); +ASSERT_POSITION(num_refidx_l0_default_active, 0x20); +ASSERT_POSITION(num_refidx_l1_default_active, 0x24); +ASSERT_POSITION(deblocking_filter_control_present_flag, 0x28); +ASSERT_POSITION(redundant_pic_cnt_present_flag, 0x2C); +ASSERT_POSITION(transform_8x8_mode_flag, 0x30); +ASSERT_POSITION(pitch_luma, 0x34); +ASSERT_POSITION(pitch_chroma, 0x38); +ASSERT_POSITION(luma_top_offset, 0x3C); +ASSERT_POSITION(luma_bot_offset, 0x40); +ASSERT_POSITION(luma_frame_offset, 0x44); +ASSERT_POSITION(chroma_top_offset, 0x48); +ASSERT_POSITION(chroma_bot_offset, 0x4C); +ASSERT_POSITION(chroma_frame_offset, 0x50); +ASSERT_POSITION(hist_buffer_size, 0x54); +ASSERT_POSITION(flags, 0x58); #undef ASSERT_POSITION +struct DpbEntry { + union { + BitField<0, 7, u32> index; + BitField<7, 5, u32> col_idx; + BitField<12, 2, u32> state; + BitField<14, 1, u32> is_long_term; + BitField<15, 1, u32> non_existing; + BitField<16, 1, u32> is_field; + BitField<17, 4, u32> top_field_marking; + BitField<21, 4, u32> bottom_field_marking; + BitField<25, 1, u32> output_memory_layout; + BitField<26, 6, u32> reserved; + } flags; + std::array field_order_cnt; + u32 frame_idx; +}; +static_assert(sizeof(DpbEntry) == 0x10, "DpbEntry has the wrong size!"); + +struct DisplayParam { + union { + BitField<0, 1, u32> enable_tf_output; + BitField<1, 1, u32> vc1_map_y_flag; + BitField<2, 3, u32> map_y_value; + BitField<5, 1, u32> vc1_map_uv_flag; + BitField<6, 3, u32> map_uv_value; + BitField<9, 8, u32> out_stride; + BitField<17, 3, u32> tiling_format; + BitField<20, 1, u32> output_structure; // 0=frame, 1=field + BitField<21, 11, u32> reserved0; + }; + std::array output_top; + std::array output_bottom; + union { + BitField<0, 1, u32> enable_histogram; + BitField<1, 12, u32> histogram_start_x; + BitField<13, 12, u32> histogram_start_y; + BitField<25, 7, u32> reserved1; + }; + union { + BitField<0, 12, u32> histogram_end_x; + BitField<12, 12, u32> histogram_end_y; + BitField<24, 8, u32> reserved2; + }; +}; +static_assert(sizeof(DisplayParam) == 0x1C, "DisplayParam has the wrong size!"); + +struct H264DecoderContext { + INSERT_PADDING_WORDS_NOINIT(13); ///< 0x0000 + std::array eos; ///< 0x0034 + u8 explicit_eos_present_flag; ///< 0x0044 + u8 hint_dump_en; ///< 0x0045 + INSERT_PADDING_BYTES_NOINIT(2); ///< 0x0046 + u32 stream_len; ///< 0x0048 + u32 slice_count; ///< 0x004C + u32 mbhist_buffer_size; ///< 0x0050 + u32 gptimer_timeout_value; ///< 0x0054 + H264ParameterSet h264_parameter_set; ///< 0x0058 + std::array curr_field_order_cnt; ///< 0x00B8 + std::array dpb; ///< 0x00C0 + std::array weight_scale_4x4; ///< 0x01C0 + std::array weight_scale_8x8; ///< 0x0220 + std::array num_inter_view_refs_lX; ///< 0x02A0 + std::array reserved2; ///< 0x02A2 + std::array, 2> inter_view_refidx_lX; ///< 0x02B0 + union { ///< 0x02D0 + BitField<0, 1, u32> lossless_ipred8x8_filter_enable; + BitField<1, 1, u32> qpprime_y_zero_transform_bypass_flag; + BitField<2, 30, u32> reserved3; + }; + DisplayParam display_param; ///< 0x02D4 + std::array reserved4; ///< 0x02F0 +}; +static_assert(sizeof(H264DecoderContext) == 0x2FC, "H264DecoderContext is an invalid size"); + #define ASSERT_POSITION(field_name, position) \ static_assert(offsetof(H264DecoderContext, field_name) == position, \ "Field " #field_name " has invalid position") - ASSERT_POSITION(stream_len, 0x48); - ASSERT_POSITION(h264_parameter_set, 0x58); - ASSERT_POSITION(weight_scale, 0x1C0); +ASSERT_POSITION(stream_len, 0x48); +ASSERT_POSITION(h264_parameter_set, 0x58); +ASSERT_POSITION(dpb, 0xC0); +ASSERT_POSITION(weight_scale_4x4, 0x1C0); #undef ASSERT_POSITION + +class H264 final : public Decoder { +public: + explicit H264(Host1x::Host1x& host1x, const Host1x::NvdecCommon::NvdecRegisters& regs, s32 id, + Host1x::FrameQueue& frame_queue); + ~H264() override; + + H264(const H264&) = delete; + H264& operator=(const H264&) = delete; + + H264(H264&&) = delete; + H264& operator=(H264&&) = delete; + + /// Compose the H264 frame for FFmpeg decoding + [[nodiscard]] std::span ComposeFrame() override; + + std::tuple GetProgressiveOffsets() override; + std::tuple GetInterlacedOffsets() override; + bool IsInterlaced() override; + + std::string_view GetCurrentCodecName() const override { + return "H264"; + } + +private: + bool is_first_frame{true}; + Common::ScratchBuffer frame_scratch; + Common::ScratchBuffer scan_scratch; + H264DecoderContext current_context{}; }; -} // namespace Decoder +} // namespace Decoders } // namespace Tegra diff --git a/src/video_core/host1x/codecs/vp8.cpp b/src/video_core/host1x/codecs/vp8.cpp index be97e3b00..6094f16e0 100644 --- a/src/video_core/host1x/codecs/vp8.cpp +++ b/src/video_core/host1x/codecs/vp8.cpp @@ -7,47 +7,70 @@ #include "video_core/host1x/host1x.h" #include "video_core/memory_manager.h" -namespace Tegra::Decoder { -VP8::VP8(Host1x::Host1x& host1x_) : host1x{host1x_} {} +namespace Tegra::Decoders { +VP8::VP8(Host1x::Host1x& host1x_, const Host1x::NvdecCommon::NvdecRegisters& regs_, s32 id_, + Host1x::FrameQueue& frame_queue_) + : Decoder{host1x_, id_, regs_, frame_queue_} { + codec = Host1x::NvdecCommon::VideoCodec::VP8; + initialized = decode_api.Initialize(codec); +} VP8::~VP8() = default; -std::span VP8::ComposeFrame(const Host1x::NvdecCommon::NvdecRegisters& state) { - VP8PictureInfo info; - host1x.GMMU().ReadBlock(state.picture_info_offset, &info, sizeof(VP8PictureInfo)); +std::tuple VP8::GetProgressiveOffsets() { + auto luma{regs.surface_luma_offsets[static_cast(Vp8SurfaceIndex::Current)].Address()}; + auto chroma{regs.surface_chroma_offsets[static_cast(Vp8SurfaceIndex::Current)].Address()}; + return {luma, chroma}; +} - const bool is_key_frame = info.key_frame == 1u; - const auto bitstream_size = static_cast(info.vld_buffer_size); +std::tuple VP8::GetInterlacedOffsets() { + auto luma_top{regs.surface_luma_offsets[static_cast(Vp8SurfaceIndex::Current)].Address()}; + auto luma_bottom{ + regs.surface_luma_offsets[static_cast(Vp8SurfaceIndex::Current)].Address()}; + auto chroma_top{ + regs.surface_chroma_offsets[static_cast(Vp8SurfaceIndex::Current)].Address()}; + auto chroma_bottom{ + regs.surface_chroma_offsets[static_cast(Vp8SurfaceIndex::Current)].Address()}; + return {luma_top, luma_bottom, chroma_top, chroma_bottom}; +} + +std::span VP8::ComposeFrame() { + memory_manager.ReadBlock(regs.picture_info_offset.Address(), ¤t_context, + sizeof(VP8PictureInfo)); + + const bool is_key_frame = current_context.key_frame == 1u; + const auto bitstream_size = static_cast(current_context.vld_buffer_size); const size_t header_size = is_key_frame ? 10u : 3u; - frame.resize(header_size + bitstream_size); + frame_scratch.resize(header_size + bitstream_size); // Based on page 30 of the VP8 specification. // https://datatracker.ietf.org/doc/rfc6386/ - frame[0] = is_key_frame ? 0u : 1u; // 1-bit frame type (0: keyframe, 1: interframes). - frame[0] |= static_cast((info.version & 7u) << 1u); // 3-bit version number - frame[0] |= static_cast(1u << 4u); // 1-bit show_frame flag + frame_scratch[0] = is_key_frame ? 0u : 1u; // 1-bit frame type (0: keyframe, 1: interframes). + frame_scratch[0] |= + static_cast((current_context.version & 7u) << 1u); // 3-bit version number + frame_scratch[0] |= static_cast(1u << 4u); // 1-bit show_frame flag // The next 19-bits are the first partition size - frame[0] |= static_cast((info.first_part_size & 7u) << 5u); - frame[1] = static_cast((info.first_part_size & 0x7f8u) >> 3u); - frame[2] = static_cast((info.first_part_size & 0x7f800u) >> 11u); + frame_scratch[0] |= static_cast((current_context.first_part_size & 7u) << 5u); + frame_scratch[1] = static_cast((current_context.first_part_size & 0x7f8u) >> 3u); + frame_scratch[2] = static_cast((current_context.first_part_size & 0x7f800u) >> 11u); if (is_key_frame) { - frame[3] = 0x9du; - frame[4] = 0x01u; - frame[5] = 0x2au; + frame_scratch[3] = 0x9du; + frame_scratch[4] = 0x01u; + frame_scratch[5] = 0x2au; // TODO(ameerj): Horizontal/Vertical Scale // 16 bits: (2 bits Horizontal Scale << 14) | Width (14 bits) - frame[6] = static_cast(info.frame_width & 0xff); - frame[7] = static_cast(((info.frame_width >> 8) & 0x3f)); + frame_scratch[6] = static_cast(current_context.frame_width & 0xff); + frame_scratch[7] = static_cast(((current_context.frame_width >> 8) & 0x3f)); // 16 bits:(2 bits Vertical Scale << 14) | Height (14 bits) - frame[8] = static_cast(info.frame_height & 0xff); - frame[9] = static_cast(((info.frame_height >> 8) & 0x3f)); + frame_scratch[8] = static_cast(current_context.frame_height & 0xff); + frame_scratch[9] = static_cast(((current_context.frame_height >> 8) & 0x3f)); } - const u64 bitstream_offset = state.frame_bitstream_offset; - host1x.GMMU().ReadBlock(bitstream_offset, frame.data() + header_size, bitstream_size); + const u64 bitstream_offset = regs.frame_bitstream_offset.Address(); + memory_manager.ReadBlock(bitstream_offset, frame_scratch.data() + header_size, bitstream_size); - return frame; + return frame_scratch; } -} // namespace Tegra::Decoder +} // namespace Tegra::Decoders diff --git a/src/video_core/host1x/codecs/vp8.h b/src/video_core/host1x/codecs/vp8.h index 5945e4658..74800281d 100644 --- a/src/video_core/host1x/codecs/vp8.h +++ b/src/video_core/host1x/codecs/vp8.h @@ -9,6 +9,7 @@ #include "common/common_funcs.h" #include "common/common_types.h" #include "common/scratch_buffer.h" +#include "video_core/host1x/codecs/decoder.h" #include "video_core/host1x/nvdec_common.h" namespace Tegra { @@ -17,20 +18,41 @@ namespace Host1x { class Host1x; } // namespace Host1x -namespace Decoder { +namespace Decoders { +enum class Vp8SurfaceIndex : u32 { + Last = 0, + Golden = 1, + AltRef = 2, + Current = 3, +}; -class VP8 { +class VP8 final : public Decoder { public: - explicit VP8(Host1x::Host1x& host1x); - ~VP8(); + explicit VP8(Host1x::Host1x& host1x, const Host1x::NvdecCommon::NvdecRegisters& regs, s32 id, + Host1x::FrameQueue& frame_queue); + ~VP8() override; - /// Compose the VP8 frame for FFmpeg decoding - [[nodiscard]] std::span ComposeFrame( - const Host1x::NvdecCommon::NvdecRegisters& state); + VP8(const VP8&) = delete; + VP8& operator=(const VP8&) = delete; + + VP8(VP8&&) = delete; + VP8& operator=(VP8&&) = delete; + + [[nodiscard]] std::span ComposeFrame() override; + + std::tuple GetProgressiveOffsets() override; + std::tuple GetInterlacedOffsets() override; + + bool IsInterlaced() override { + return false; + } + + std::string_view GetCurrentCodecName() const override { + return "VP8"; + } private: - Common::ScratchBuffer frame; - Host1x::Host1x& host1x; + Common::ScratchBuffer frame_scratch; struct VP8PictureInfo { INSERT_PADDING_WORDS_NOINIT(14); @@ -73,7 +95,9 @@ private: INSERT_PADDING_WORDS_NOINIT(3); }; static_assert(sizeof(VP8PictureInfo) == 0xc0, "PictureInfo is an invalid size"); + + VP8PictureInfo current_context{}; }; -} // namespace Decoder +} // namespace Decoders } // namespace Tegra diff --git a/src/video_core/host1x/codecs/vp9.cpp b/src/video_core/host1x/codecs/vp9.cpp index 65d6fb2d5..c70d0a506 100644 --- a/src/video_core/host1x/codecs/vp9.cpp +++ b/src/video_core/host1x/codecs/vp9.cpp @@ -4,12 +4,13 @@ #include // for std::copy #include +#include "common/alignment.h" #include "common/assert.h" #include "video_core/host1x/codecs/vp9.h" #include "video_core/host1x/host1x.h" #include "video_core/memory_manager.h" -namespace Tegra::Decoder { +namespace Tegra::Decoders { namespace { constexpr u32 diff_update_probability = 252; constexpr u32 frame_sync_code = 0x498342; @@ -237,7 +238,12 @@ constexpr std::array map_lut{ } } // Anonymous namespace -VP9::VP9(Host1x::Host1x& host1x_) : host1x{host1x_} {} +VP9::VP9(Host1x::Host1x& host1x_, const Host1x::NvdecCommon::NvdecRegisters& regs_, s32 id_, + Host1x::FrameQueue& frame_queue_) + : Decoder{host1x_, id_, regs_, frame_queue_} { + codec = Host1x::NvdecCommon::VideoCodec::VP9; + initialized = decode_api.Initialize(codec); +} VP9::~VP9() = default; @@ -356,35 +362,113 @@ void VP9::WriteMvProbabilityUpdate(VpxRangeEncoder& writer, u8 new_prob, u8 old_ } } -Vp9PictureInfo VP9::GetVp9PictureInfo(const Host1x::NvdecCommon::NvdecRegisters& state) { - PictureInfo picture_info; - host1x.GMMU().ReadBlock(state.picture_info_offset, &picture_info, sizeof(PictureInfo)); - Vp9PictureInfo vp9_info = picture_info.Convert(); +void VP9::WriteSegmentation(VpxBitStreamWriter& writer) { + bool enabled = current_picture_info.segmentation.enabled != 0; + writer.WriteBit(enabled); + if (!enabled) { + return; + } - InsertEntropy(state.vp9_entropy_probs_offset, vp9_info.entropy); + auto update_map = current_picture_info.segmentation.update_map != 0; + writer.WriteBit(update_map); + + if (update_map) { + EntropyProbs entropy_probs{}; + memory_manager.ReadBlock(regs.vp9_prob_tab_buffer_offset.Address(), &entropy_probs, + sizeof(entropy_probs)); + + auto WriteProb = [&](u8 prob) { + bool coded = prob != 255; + writer.WriteBit(coded); + if (coded) { + writer.WriteU(prob, 8); + } + }; + + for (size_t i = 0; i < entropy_probs.mb_segment_tree_probs.size(); i++) { + WriteProb(entropy_probs.mb_segment_tree_probs[i]); + } + + auto temporal_update = current_picture_info.segmentation.temporal_update != 0; + writer.WriteBit(temporal_update); + + if (temporal_update) { + for (s32 i = 0; i < 3; i++) { + WriteProb(entropy_probs.segment_pred_probs[i]); + } + } + } + + if (last_segmentation == current_picture_info.segmentation) { + writer.WriteBit(false); + return; + } + + last_segmentation = current_picture_info.segmentation; + writer.WriteBit(true); + writer.WriteBit(current_picture_info.segmentation.abs_delta != 0); + + constexpr s32 MAX_SEGMENTS = 8; + constexpr std::array SegmentationFeatureBits = {8, 6, 2, 0}; + + for (s32 i = 0; i < MAX_SEGMENTS; i++) { + auto q_enabled = current_picture_info.segmentation.feature_enabled[i][0] != 0; + writer.WriteBit(q_enabled); + if (q_enabled) { + writer.WriteS(current_picture_info.segmentation.feature_data[i][0], + SegmentationFeatureBits[0]); + } + + auto lf_enabled = current_picture_info.segmentation.feature_enabled[i][1] != 0; + writer.WriteBit(lf_enabled); + if (lf_enabled) { + writer.WriteS(current_picture_info.segmentation.feature_data[i][1], + SegmentationFeatureBits[1]); + } + + auto ref_enabled = current_picture_info.segmentation.feature_enabled[i][2] != 0; + writer.WriteBit(ref_enabled); + if (ref_enabled) { + writer.WriteU(current_picture_info.segmentation.feature_data[i][2], + SegmentationFeatureBits[2]); + } + + auto skip_enabled = current_picture_info.segmentation.feature_enabled[i][3] != 0; + writer.WriteBit(skip_enabled); + } +} + +Vp9PictureInfo VP9::GetVp9PictureInfo() { + memory_manager.ReadBlock(regs.picture_info_offset.Address(), ¤t_picture_info, + sizeof(PictureInfo)); + Vp9PictureInfo vp9_info = current_picture_info.Convert(); + + InsertEntropy(regs.vp9_prob_tab_buffer_offset.Address(), vp9_info.entropy); // surface_luma_offset[0:3] contains the address of the reference frame offsets in the following // order: last, golden, altref, current. - std::copy(state.surface_luma_offset.begin(), state.surface_luma_offset.begin() + 4, - vp9_info.frame_offsets.begin()); + for (size_t i = 0; i < 4; i++) { + vp9_info.frame_offsets[i] = regs.surface_luma_offsets[i].Address(); + } return vp9_info; } void VP9::InsertEntropy(u64 offset, Vp9EntropyProbs& dst) { EntropyProbs entropy; - host1x.GMMU().ReadBlock(offset, &entropy, sizeof(EntropyProbs)); + memory_manager.ReadBlock(offset, &entropy, sizeof(EntropyProbs)); entropy.Convert(dst); } -Vp9FrameContainer VP9::GetCurrentFrame(const Host1x::NvdecCommon::NvdecRegisters& state) { +Vp9FrameContainer VP9::GetCurrentFrame() { Vp9FrameContainer current_frame{}; { // gpu.SyncGuestHost(); epic, why? - current_frame.info = GetVp9PictureInfo(state); + current_frame.info = GetVp9PictureInfo(); current_frame.bit_stream.resize(current_frame.info.bitstream_size); - host1x.GMMU().ReadBlock(state.frame_bitstream_offset, current_frame.bit_stream.data(), - current_frame.info.bitstream_size); + memory_manager.ReadBlock(regs.frame_bitstream_offset.Address(), + current_frame.bit_stream.data(), + current_frame.info.bitstream_size); } if (!next_frame.bit_stream.empty()) { Vp9FrameContainer temp{ @@ -742,8 +826,7 @@ VpxBitStreamWriter VP9::ComposeUncompressedHeader() { uncomp_writer.WriteDeltaQ(current_frame_info.uv_dc_delta_q); uncomp_writer.WriteDeltaQ(current_frame_info.uv_ac_delta_q); - ASSERT(!current_frame_info.segment_enabled); - uncomp_writer.WriteBit(false); // Segmentation enabled (TODO). + WriteSegmentation(uncomp_writer); const s32 min_tile_cols_log2 = CalcMinLog2TileCols(current_frame_info.frame_size.width); const s32 max_tile_cols_log2 = CalcMaxLog2TileCols(current_frame_info.frame_size.width); @@ -770,10 +853,29 @@ VpxBitStreamWriter VP9::ComposeUncompressedHeader() { return uncomp_writer; } -void VP9::ComposeFrame(const Host1x::NvdecCommon::NvdecRegisters& state) { +std::tuple VP9::GetProgressiveOffsets() { + auto luma{regs.surface_luma_offsets[static_cast(Vp9SurfaceIndex::Current)].Address()}; + auto chroma{regs.surface_chroma_offsets[static_cast(Vp9SurfaceIndex::Current)].Address()}; + return {luma, chroma}; +} + +std::tuple VP9::GetInterlacedOffsets() { + auto luma_top{regs.surface_luma_offsets[static_cast(Vp9SurfaceIndex::Current)].Address()}; + auto luma_bottom{ + regs.surface_luma_offsets[static_cast(Vp9SurfaceIndex::Current)].Address()}; + auto chroma_top{ + regs.surface_chroma_offsets[static_cast(Vp9SurfaceIndex::Current)].Address()}; + auto chroma_bottom{ + regs.surface_chroma_offsets[static_cast(Vp9SurfaceIndex::Current)].Address()}; + return {luma_top, luma_bottom, chroma_top, chroma_bottom}; +} + +std::span VP9::ComposeFrame() { + vp9_hidden_frame = false; + std::vector bitstream; { - Vp9FrameContainer curr_frame = GetCurrentFrame(state); + Vp9FrameContainer curr_frame = GetCurrentFrame(); current_frame_info = curr_frame.info; bitstream = std::move(curr_frame.bit_stream); } @@ -786,12 +888,16 @@ void VP9::ComposeFrame(const Host1x::NvdecCommon::NvdecRegisters& state) { std::vector uncompressed_header = uncomp_writer.GetByteArray(); // Write headers and frame to buffer - frame.resize(uncompressed_header.size() + compressed_header.size() + bitstream.size()); - std::copy(uncompressed_header.begin(), uncompressed_header.end(), frame.begin()); + frame_scratch.resize(uncompressed_header.size() + compressed_header.size() + bitstream.size()); + std::copy(uncompressed_header.begin(), uncompressed_header.end(), frame_scratch.begin()); std::copy(compressed_header.begin(), compressed_header.end(), - frame.begin() + uncompressed_header.size()); + frame_scratch.begin() + uncompressed_header.size()); std::copy(bitstream.begin(), bitstream.end(), - frame.begin() + uncompressed_header.size() + compressed_header.size()); + frame_scratch.begin() + uncompressed_header.size() + compressed_header.size()); + + vp9_hidden_frame = WasFrameHidden(); + + return GetFrameBytes(); } VpxRangeEncoder::VpxRangeEncoder() { @@ -944,4 +1050,4 @@ const std::vector& VpxBitStreamWriter::GetByteArray() const { return byte_array; } -} // namespace Tegra::Decoder +} // namespace Tegra::Decoders diff --git a/src/video_core/host1x/codecs/vp9.h b/src/video_core/host1x/codecs/vp9.h index f1ed19508..9d42033cb 100644 --- a/src/video_core/host1x/codecs/vp9.h +++ b/src/video_core/host1x/codecs/vp9.h @@ -10,6 +10,7 @@ #include "common/common_types.h" #include "common/scratch_buffer.h" #include "common/stream.h" +#include "video_core/host1x/codecs/decoder.h" #include "video_core/host1x/codecs/vp9_types.h" #include "video_core/host1x/nvdec_common.h" @@ -19,7 +20,7 @@ namespace Host1x { class Host1x; } // namespace Host1x -namespace Decoder { +namespace Decoders { /// The VpxRangeEncoder, and VpxBitStreamWriter classes are used to compose the /// VP9 header bitstreams. @@ -110,21 +111,32 @@ private: std::vector byte_array; }; -class VP9 { +class VP9 final : public Decoder { public: - explicit VP9(Host1x::Host1x& host1x); - ~VP9(); + explicit VP9(Host1x::Host1x& host1x, const Host1x::NvdecCommon::NvdecRegisters& regs, s32 id, + Host1x::FrameQueue& frame_queue); + ~VP9() override; VP9(const VP9&) = delete; VP9& operator=(const VP9&) = delete; - VP9(VP9&&) = default; + VP9(VP9&&) = delete; VP9& operator=(VP9&&) = delete; - /// Composes the VP9 frame from the GPU state information. - /// Based on the official VP9 spec documentation - void ComposeFrame(const Host1x::NvdecCommon::NvdecRegisters& state); + [[nodiscard]] std::span ComposeFrame() override; + std::tuple GetProgressiveOffsets() override; + std::tuple GetInterlacedOffsets() override; + + bool IsInterlaced() override { + return false; + } + + std::string_view GetCurrentCodecName() const override { + return "VP9"; + } + +private: /// Returns true if the most recent frame was a hidden frame. [[nodiscard]] bool WasFrameHidden() const { return !current_frame_info.show_frame; @@ -132,10 +144,9 @@ public: /// Returns a const span to the composed frame data. [[nodiscard]] std::span GetFrameBytes() const { - return frame; + return frame_scratch; } -private: /// Generates compressed header probability updates in the bitstream writer template void WriteProbabilityUpdate(VpxRangeEncoder& writer, const std::array& new_prob, @@ -167,23 +178,22 @@ private: /// Write motion vector probability updates. 6.3.17 in the spec void WriteMvProbabilityUpdate(VpxRangeEncoder& writer, u8 new_prob, u8 old_prob); + void WriteSegmentation(VpxBitStreamWriter& writer); + /// Returns VP9 information from NVDEC provided offset and size - [[nodiscard]] Vp9PictureInfo GetVp9PictureInfo( - const Host1x::NvdecCommon::NvdecRegisters& state); + [[nodiscard]] Vp9PictureInfo GetVp9PictureInfo(); /// Read and convert NVDEC provided entropy probs to Vp9EntropyProbs struct void InsertEntropy(u64 offset, Vp9EntropyProbs& dst); /// Returns frame to be decoded after buffering - [[nodiscard]] Vp9FrameContainer GetCurrentFrame( - const Host1x::NvdecCommon::NvdecRegisters& state); + [[nodiscard]] Vp9FrameContainer GetCurrentFrame(); /// Use NVDEC providied information to compose the headers for the current frame [[nodiscard]] std::vector ComposeCompressedHeader(); [[nodiscard]] VpxBitStreamWriter ComposeUncompressedHeader(); - Host1x::Host1x& host1x; - Common::ScratchBuffer frame; + Common::ScratchBuffer frame_scratch; std::array loop_filter_ref_deltas{}; std::array loop_filter_mode_deltas{}; @@ -192,9 +202,11 @@ private: std::array frame_ctxs{}; bool swap_ref_indices{}; + Segmentation last_segmentation{}; + PictureInfo current_picture_info{}; Vp9PictureInfo current_frame_info{}; Vp9EntropyProbs prev_frame_probs{}; }; -} // namespace Decoder +} // namespace Decoders } // namespace Tegra diff --git a/src/video_core/host1x/codecs/vp9_types.h b/src/video_core/host1x/codecs/vp9_types.h index cc9b25690..77535d5f6 100644 --- a/src/video_core/host1x/codecs/vp9_types.h +++ b/src/video_core/host1x/codecs/vp9_types.h @@ -11,7 +11,14 @@ namespace Tegra { -namespace Decoder { +namespace Decoders { +enum class Vp9SurfaceIndex : u32 { + Last = 0, + Golden = 1, + AltRef = 2, + Current = 3, +}; + struct Vp9FrameDimensions { s16 width; s16 height; @@ -48,11 +55,13 @@ enum class TxMode { }; struct Segmentation { + constexpr bool operator==(const Segmentation& rhs) const = default; + u8 enabled; u8 update_map; u8 temporal_update; u8 abs_delta; - std::array feature_mask; + std::array, 8> feature_enabled; std::array, 8> feature_data; }; static_assert(sizeof(Segmentation) == 0x64, "Segmentation is an invalid size"); @@ -190,7 +199,17 @@ struct PictureInfo { static_assert(sizeof(PictureInfo) == 0x100, "PictureInfo is an invalid size"); struct EntropyProbs { - INSERT_PADDING_BYTES_NOINIT(1024); ///< 0x0000 + std::array kf_bmode_prob; ///< 0x0000 + std::array kf_bmode_probB; ///< 0x0320 + std::array ref_pred_probs; ///< 0x0384 + std::array mb_segment_tree_probs; ///< 0x0387 + std::array segment_pred_probs; ///< 0x038E + std::array ref_scores; ///< 0x0391 + std::array prob_comppred; ///< 0x0395 + INSERT_PADDING_BYTES_NOINIT(9); ///< 0x0397 + std::array kf_uv_mode_prob; ///< 0x03A0 + std::array kf_uv_mode_probB; ///< 0x03F0 + INSERT_PADDING_BYTES_NOINIT(6); ///< 0x03FA std::array inter_mode_prob; ///< 0x0400 std::array intra_inter_prob; ///< 0x041C INSERT_PADDING_BYTES_NOINIT(80); ///< 0x0420 @@ -302,5 +321,5 @@ ASSERT_POSITION(class_0_fr, 0x560); ASSERT_POSITION(coef_probs, 0x5A0); #undef ASSERT_POSITION -}; // namespace Decoder +}; // namespace Decoders }; // namespace Tegra diff --git a/src/video_core/host1x/control.cpp b/src/video_core/host1x/control.cpp index dceefdb7f..bd0ce9160 100644 --- a/src/video_core/host1x/control.cpp +++ b/src/video_core/host1x/control.cpp @@ -27,6 +27,7 @@ void Control::ProcessMethod(Method method, u32 argument) { } void Control::Execute(u32 data) { + LOG_TRACE(Service_NVDRV, "Control wait syncpt {} value {}", data, syncpoint_value); host1x.GetSyncpointManager().WaitHost(data, syncpoint_value); } diff --git a/src/video_core/host1x/control.h b/src/video_core/host1x/control.h index e117888a3..bd8a2d7ad 100644 --- a/src/video_core/host1x/control.h +++ b/src/video_core/host1x/control.h @@ -6,9 +6,7 @@ #include "common/common_types.h" -namespace Tegra { - -namespace Host1x { +namespace Tegra::Host1x { class Host1x; class Nvdec; @@ -31,10 +29,8 @@ private: /// For Host1x, execute is waiting on a syncpoint previously written into the state void Execute(u32 data); - u32 syncpoint_value{}; Host1x& host1x; + u32 syncpoint_value{}; }; -} // namespace Host1x - -} // namespace Tegra +} // namespace Tegra::Host1x diff --git a/src/video_core/host1x/ffmpeg/ffmpeg.cpp b/src/video_core/host1x/ffmpeg/ffmpeg.cpp index 1003cd38d..7e955223d 100644 --- a/src/video_core/host1x/ffmpeg/ffmpeg.cpp +++ b/src/video_core/host1x/ffmpeg/ffmpeg.cpp @@ -5,7 +5,9 @@ #include "common/logging/log.h" #include "common/scope_exit.h" #include "common/settings.h" +#include "core/memory.h" #include "video_core/host1x/ffmpeg/ffmpeg.h" +#include "video_core/memory_manager.h" extern "C" { #ifdef LIBVA_FOUND @@ -149,6 +151,7 @@ bool HardwareContext::InitializeForDecoder(DecoderContext& decoder_context, } } + LOG_INFO(HW_GPU, "Hardware decoding is disabled due to implementation issues, using CPU."); return false; } @@ -183,8 +186,8 @@ bool HardwareContext::InitializeWithType(AVHWDeviceType type) { return true; } -DecoderContext::DecoderContext(const Decoder& decoder) { - m_codec_context = avcodec_alloc_context3(decoder.GetCodec()); +DecoderContext::DecoderContext(const Decoder& decoder) : m_decoder{decoder} { + m_codec_context = avcodec_alloc_context3(m_decoder.GetCodec()); av_opt_set(m_codec_context->priv_data, "tune", "zerolatency", 0); m_codec_context->thread_count = 0; m_codec_context->thread_type &= ~FF_THREAD_FRAME; @@ -216,6 +219,23 @@ bool DecoderContext::OpenContext(const Decoder& decoder) { } bool DecoderContext::SendPacket(const Packet& packet) { + m_temp_frame = std::make_shared(); + m_got_frame = 0; + +// Android can randomly crash when calling decode directly, so skip. +// TODO update ffmpeg and hope that fixes it. +#ifndef ANDROID + if (!m_codec_context->hw_device_ctx && m_codec_context->codec_id == AV_CODEC_ID_H264) { + m_decode_order = true; + const int ret = avcodec_send_frame(m_codec_context, m_temp_frame->GetFrame()); + if (ret < 0) { + LOG_DEBUG(Service_NVDRV, "avcodec_send_packet error {}", AVError(ret)); + return false; + } + return true; + } +#endif + if (const int ret = avcodec_send_packet(m_codec_context, packet.GetPacket()); ret < 0) { LOG_ERROR(HW_GPU, "avcodec_send_packet error: {}", AVError(ret)); return false; @@ -224,139 +244,72 @@ bool DecoderContext::SendPacket(const Packet& packet) { return true; } -std::unique_ptr DecoderContext::ReceiveFrame(bool* out_is_interlaced) { - auto dst_frame = std::make_unique(); +std::shared_ptr DecoderContext::ReceiveFrame() { + // Android can randomly crash when calling decode directly, so skip. + // TODO update ffmpeg and hope that fixes it. +#ifndef ANDROID + if (!m_codec_context->hw_device_ctx && m_codec_context->codec_id == AV_CODEC_ID_H264) { + m_decode_order = true; + int ret{0}; - const auto ReceiveImpl = [&](AVFrame* frame) { - if (const int ret = avcodec_receive_frame(m_codec_context, frame); ret < 0) { - LOG_ERROR(HW_GPU, "avcodec_receive_frame error: {}", AVError(ret)); - return false; + if (m_got_frame == 0) { + Packet packet{{}}; + auto* pkt = packet.GetPacket(); + pkt->data = nullptr; + pkt->size = 0; + ret = avcodec_receive_packet(m_codec_context, pkt); + m_codec_context->has_b_frames = 0; } - *out_is_interlaced = -#if defined(FF_API_INTERLACED_FRAME) || LIBAVUTIL_VERSION_MAJOR >= 59 - (frame->flags & AV_FRAME_FLAG_INTERLACED) != 0; -#else - frame->interlaced_frame != 0; + if (m_got_frame == 0 || ret < 0) { + LOG_ERROR(Service_NVDRV, "Failed to receive a frame! error {}", ret); + return {}; + } + } else #endif - return true; - }; + { - if (m_codec_context->hw_device_ctx) { - // If we have a hardware context, make a separate frame here to receive the - // hardware result before sending it to the output. - Frame intermediate_frame; + const auto ReceiveImpl = [&](AVFrame* frame) { + if (const int ret = avcodec_receive_frame(m_codec_context, frame); ret < 0) { + LOG_ERROR(HW_GPU, "avcodec_receive_frame error: {}", AVError(ret)); + return false; + } - if (!ReceiveImpl(intermediate_frame.GetFrame())) { - return {}; - } + return true; + }; - dst_frame->SetFormat(PreferredGpuFormat); - if (const int ret = - av_hwframe_transfer_data(dst_frame->GetFrame(), intermediate_frame.GetFrame(), 0); - ret < 0) { - LOG_ERROR(HW_GPU, "av_hwframe_transfer_data error: {}", AVError(ret)); - return {}; - } - } else { - // Otherwise, decode the frame as normal. - if (!ReceiveImpl(dst_frame->GetFrame())) { - return {}; + if (m_codec_context->hw_device_ctx) { + // If we have a hardware context, make a separate frame here to receive the + // hardware result before sending it to the output. + Frame intermediate_frame; + + if (!ReceiveImpl(intermediate_frame.GetFrame())) { + return {}; + } + + m_temp_frame->SetFormat(PreferredGpuFormat); + if (const int ret = av_hwframe_transfer_data(m_temp_frame->GetFrame(), + intermediate_frame.GetFrame(), 0); + ret < 0) { + LOG_ERROR(HW_GPU, "av_hwframe_transfer_data error: {}", AVError(ret)); + return {}; + } + } else { + // Otherwise, decode the frame as normal. + if (!ReceiveImpl(m_temp_frame->GetFrame())) { + return {}; + } } } - return dst_frame; -} - -DeinterlaceFilter::DeinterlaceFilter(const Frame& frame) { - const AVFilter* buffer_src = avfilter_get_by_name("buffer"); - const AVFilter* buffer_sink = avfilter_get_by_name("buffersink"); - AVFilterInOut* inputs = avfilter_inout_alloc(); - AVFilterInOut* outputs = avfilter_inout_alloc(); - SCOPE_EXIT { - avfilter_inout_free(&inputs); - avfilter_inout_free(&outputs); - }; - - // Don't know how to get the accurate time_base but it doesn't matter for yadif filter - // so just use 1/1 to make buffer filter happy - std::string args = fmt::format("video_size={}x{}:pix_fmt={}:time_base=1/1", frame.GetWidth(), - frame.GetHeight(), static_cast(frame.GetPixelFormat())); - - m_filter_graph = avfilter_graph_alloc(); - int ret = avfilter_graph_create_filter(&m_source_context, buffer_src, "in", args.c_str(), - nullptr, m_filter_graph); - if (ret < 0) { - LOG_ERROR(HW_GPU, "avfilter_graph_create_filter source error: {}", AVError(ret)); - return; - } - - ret = avfilter_graph_create_filter(&m_sink_context, buffer_sink, "out", nullptr, nullptr, - m_filter_graph); - if (ret < 0) { - LOG_ERROR(HW_GPU, "avfilter_graph_create_filter sink error: {}", AVError(ret)); - return; - } - - inputs->name = av_strdup("out"); - inputs->filter_ctx = m_sink_context; - inputs->pad_idx = 0; - inputs->next = nullptr; - - outputs->name = av_strdup("in"); - outputs->filter_ctx = m_source_context; - outputs->pad_idx = 0; - outputs->next = nullptr; - - const char* description = "yadif=1:-1:0"; - ret = avfilter_graph_parse_ptr(m_filter_graph, description, &inputs, &outputs, nullptr); - if (ret < 0) { - LOG_ERROR(HW_GPU, "avfilter_graph_parse_ptr error: {}", AVError(ret)); - return; - } - - ret = avfilter_graph_config(m_filter_graph, nullptr); - if (ret < 0) { - LOG_ERROR(HW_GPU, "avfilter_graph_config error: {}", AVError(ret)); - return; - } - - m_initialized = true; -} - -bool DeinterlaceFilter::AddSourceFrame(const Frame& frame) { - if (const int ret = av_buffersrc_add_frame_flags(m_source_context, frame.GetFrame(), - AV_BUFFERSRC_FLAG_KEEP_REF); - ret < 0) { - LOG_ERROR(HW_GPU, "av_buffersrc_add_frame_flags error: {}", AVError(ret)); - return false; - } - - return true; -} - -std::unique_ptr DeinterlaceFilter::DrainSinkFrame() { - auto dst_frame = std::make_unique(); - const int ret = av_buffersink_get_frame(m_sink_context, dst_frame->GetFrame()); - - if (ret == AVERROR(EAGAIN) || ret == AVERROR(AVERROR_EOF)) { - return {}; - } - - if (ret < 0) { - LOG_ERROR(HW_GPU, "av_buffersink_get_frame error: {}", AVError(ret)); - return {}; - } - - return dst_frame; -} - -DeinterlaceFilter::~DeinterlaceFilter() { - avfilter_graph_free(&m_filter_graph); +#if defined(FF_API_INTERLACED_FRAME) || LIBAVUTIL_VERSION_MAJOR >= 59 + m_temp_frame->GetFrame()->interlaced_frame = + (m_temp_frame->GetFrame()->flags & AV_FRAME_FLAG_INTERLACED) != 0; +#endif + return std::move(m_temp_frame); } void DecodeApi::Reset() { - m_deinterlace_filter.reset(); m_hardware_context.reset(); m_decoder_context.reset(); m_decoder.reset(); @@ -382,43 +335,14 @@ bool DecodeApi::Initialize(Tegra::Host1x::NvdecCommon::VideoCodec codec) { return true; } -bool DecodeApi::SendPacket(std::span packet_data, size_t configuration_size) { +bool DecodeApi::SendPacket(std::span packet_data) { FFmpeg::Packet packet(packet_data); return m_decoder_context->SendPacket(packet); } -void DecodeApi::ReceiveFrames(std::queue>& frame_queue) { +std::shared_ptr DecodeApi::ReceiveFrame() { // Receive raw frame from decoder. - bool is_interlaced; - auto frame = m_decoder_context->ReceiveFrame(&is_interlaced); - if (!frame) { - return; - } - - if (!is_interlaced) { - // If the frame is not interlaced, we can pend it now. - frame_queue.push(std::move(frame)); - } else { - // Create the deinterlacer if needed. - if (!m_deinterlace_filter) { - m_deinterlace_filter.emplace(*frame); - } - - // Add the frame we just received. - if (!m_deinterlace_filter->AddSourceFrame(*frame)) { - return; - } - - // Pend output fields. - while (true) { - auto filter_frame = m_deinterlace_filter->DrainSinkFrame(); - if (!filter_frame) { - break; - } - - frame_queue.push(std::move(filter_frame)); - } - } + return m_decoder_context->ReceiveFrame(); } } // namespace FFmpeg diff --git a/src/video_core/host1x/ffmpeg/ffmpeg.h b/src/video_core/host1x/ffmpeg/ffmpeg.h index 1de0bbd83..bd0ad97ad 100644 --- a/src/video_core/host1x/ffmpeg/ffmpeg.h +++ b/src/video_core/host1x/ffmpeg/ffmpeg.h @@ -20,17 +20,20 @@ extern "C" { #endif #include -#include -#include -#include -#include #include +#ifndef ANDROID +#include +#endif #if defined(__GNUC__) || defined(__clang__) #pragma GCC diagnostic pop #endif } +namespace Tegra { +class MemoryManager; +} + namespace FFmpeg { class Packet; @@ -90,6 +93,10 @@ public: return m_frame->data[plane]; } + const u8* GetPlane(int plane) const { + return m_frame->data[plane]; + } + u8** GetPlanes() const { return m_frame->data; } @@ -98,6 +105,14 @@ public: m_frame->format = format; } + bool IsInterlaced() const { + return m_frame->interlaced_frame != 0; + } + + bool IsHardwareDecoded() const { + return m_frame->hw_frames_ctx != nullptr; + } + AVFrame* GetFrame() const { return m_frame; } @@ -160,33 +175,22 @@ public: void InitializeHardwareDecoder(const HardwareContext& context, AVPixelFormat hw_pix_fmt); bool OpenContext(const Decoder& decoder); bool SendPacket(const Packet& packet); - std::unique_ptr ReceiveFrame(bool* out_is_interlaced); + std::shared_ptr ReceiveFrame(); AVCodecContext* GetCodecContext() const { return m_codec_context; } + bool UsingDecodeOrder() const { + return m_decode_order; + } + private: + const Decoder& m_decoder; AVCodecContext* m_codec_context{}; -}; - -// Wraps an AVFilterGraph. -class DeinterlaceFilter { -public: - YUZU_NON_COPYABLE(DeinterlaceFilter); - YUZU_NON_MOVEABLE(DeinterlaceFilter); - - explicit DeinterlaceFilter(const Frame& frame); - ~DeinterlaceFilter(); - - bool AddSourceFrame(const Frame& frame); - std::unique_ptr DrainSinkFrame(); - -private: - AVFilterGraph* m_filter_graph{}; - AVFilterContext* m_source_context{}; - AVFilterContext* m_sink_context{}; - bool m_initialized{}; + s32 m_got_frame{}; + std::shared_ptr m_temp_frame{}; + bool m_decode_order{}; }; class DecodeApi { @@ -200,14 +204,17 @@ public: bool Initialize(Tegra::Host1x::NvdecCommon::VideoCodec codec); void Reset(); - bool SendPacket(std::span packet_data, size_t configuration_size); - void ReceiveFrames(std::queue>& frame_queue); + bool UsingDecodeOrder() const { + return m_decoder_context->UsingDecodeOrder(); + } + + bool SendPacket(std::span packet_data); + std::shared_ptr ReceiveFrame(); private: std::optional m_decoder; std::optional m_decoder_context; std::optional m_hardware_context; - std::optional m_deinterlace_filter; }; } // namespace FFmpeg diff --git a/src/video_core/host1x/host1x.cpp b/src/video_core/host1x/host1x.cpp index e923bfa22..293bca6d7 100644 --- a/src/video_core/host1x/host1x.cpp +++ b/src/video_core/host1x/host1x.cpp @@ -3,10 +3,10 @@ #include "core/core.h" #include "video_core/host1x/host1x.h" +#include "video_core/host1x/nvdec.h" +#include "video_core/host1x/vic.h" -namespace Tegra { - -namespace Host1x { +namespace Tegra::Host1x { Host1x::Host1x(Core::System& system_) : system{system_}, syncpoint_manager{}, @@ -15,6 +15,22 @@ Host1x::Host1x(Core::System& system_) Host1x::~Host1x() = default; -} // namespace Host1x +void Host1x::StartDevice(s32 fd, ChannelType type, u32 syncpt) { + switch (type) { + case ChannelType::NvDec: + devices[fd] = std::make_unique(*this, fd, syncpt, frame_queue); + break; + case ChannelType::VIC: + devices[fd] = std::make_unique(*this, fd, syncpt, frame_queue); + break; + default: + LOG_ERROR(HW_GPU, "Unimplemented host1x device {}", static_cast(type)); + break; + } +} -} // namespace Tegra +void Host1x::StopDevice(s32 fd, ChannelType type) { + devices.erase(fd); +} + +} // namespace Tegra::Host1x diff --git a/src/video_core/host1x/host1x.h b/src/video_core/host1x/host1x.h index d72d97b7b..8debac93d 100644 --- a/src/video_core/host1x/host1x.h +++ b/src/video_core/host1x/host1x.h @@ -3,9 +3,14 @@ #pragma once +#include +#include +#include + #include "common/common_types.h" #include "common/address_space.h" +#include "video_core/cdma_pusher.h" #include "video_core/host1x/gpu_device_memory_manager.h" #include "video_core/host1x/syncpoint_manager.h" #include "video_core/memory_manager.h" @@ -14,15 +19,137 @@ namespace Core { class System; } // namespace Core -namespace Tegra { +namespace FFmpeg { +class Frame; +} // namespace FFmpeg -namespace Host1x { +namespace Tegra::Host1x { +class Nvdec; + +class FrameQueue { +public: + void Open(s32 fd) { + std::scoped_lock l{m_mutex}; + m_presentation_order.insert({fd, {}}); + m_decode_order.insert({fd, {}}); + } + + void Close(s32 fd) { + std::scoped_lock l{m_mutex}; + m_presentation_order.erase(fd); + m_decode_order.erase(fd); + } + + s32 VicFindNvdecFdFromOffset(u64 search_offset) { + std::scoped_lock l{m_mutex}; + // Vic does not know which nvdec is producing frames for it, so search all the fds here for + // the given offset. + for (auto& map : m_presentation_order) { + for (auto& [offset, frame] : map.second) { + if (offset == search_offset) { + return map.first; + } + } + } + + for (auto& map : m_decode_order) { + for (auto& [offset, frame] : map.second) { + if (offset == search_offset) { + return map.first; + } + } + } + + return -1; + } + + void PushPresentOrder(s32 fd, u64 offset, std::shared_ptr&& frame) { + std::scoped_lock l{m_mutex}; + auto map = m_presentation_order.find(fd); + if (map == m_presentation_order.end()) { + return; + } + map->second.emplace_back(offset, std::move(frame)); + } + + void PushDecodeOrder(s32 fd, u64 offset, std::shared_ptr&& frame) { + std::scoped_lock l{m_mutex}; + auto map = m_decode_order.find(fd); + if (map == m_decode_order.end()) { + return; + } + map->second.insert_or_assign(offset, std::move(frame)); + } + + std::shared_ptr GetFrame(s32 fd, u64 offset) { + if (fd == -1) { + return {}; + } + + std::scoped_lock l{m_mutex}; + auto present_map = m_presentation_order.find(fd); + if (present_map != m_presentation_order.end() && present_map->second.size() > 0) { + return GetPresentOrderLocked(fd); + } + + auto decode_map = m_decode_order.find(fd); + if (decode_map != m_decode_order.end() && decode_map->second.size() > 0) { + return GetDecodeOrderLocked(fd, offset); + } + + return {}; + } + +private: + std::shared_ptr GetPresentOrderLocked(s32 fd) { + auto map = m_presentation_order.find(fd); + if (map == m_presentation_order.end() || map->second.size() == 0) { + return {}; + } + auto frame = std::move(map->second.front().second); + map->second.pop_front(); + return frame; + } + + std::shared_ptr GetDecodeOrderLocked(s32 fd, u64 offset) { + auto map = m_decode_order.find(fd); + if (map == m_decode_order.end() || map->second.size() == 0) { + return {}; + } + auto it = map->second.find(offset); + if (it == map->second.end()) { + return {}; + } + return std::move(map->second.extract(it).mapped()); + } + + using FramePtr = std::shared_ptr; + + std::mutex m_mutex{}; + std::unordered_map>> m_presentation_order; + std::unordered_map> m_decode_order; +}; + +enum class ChannelType : u32 { + MsEnc = 0, + VIC = 1, + GPU = 2, + NvDec = 3, + Display = 4, + NvJpg = 5, + TSec = 6, + Max = 7, +}; class Host1x { public: explicit Host1x(Core::System& system); ~Host1x(); + Core::System& System() { + return system; + } + SyncpointManager& GetSyncpointManager() { return syncpoint_manager; } @@ -55,14 +182,25 @@ public: return *allocator; } + void StartDevice(s32 fd, ChannelType type, u32 syncpt); + void StopDevice(s32 fd, ChannelType type); + + void PushEntries(s32 fd, ChCommandHeaderList&& entries) { + auto it = devices.find(fd); + if (it == devices.end()) { + return; + } + it->second->PushEntries(std::move(entries)); + } + private: Core::System& system; SyncpointManager syncpoint_manager; Tegra::MaxwellDeviceMemoryManager memory_manager; Tegra::MemoryManager gmmu_manager; std::unique_ptr> allocator; + FrameQueue frame_queue; + std::unordered_map> devices; }; -} // namespace Host1x - -} // namespace Tegra +} // namespace Tegra::Host1x diff --git a/src/video_core/host1x/nvdec.cpp b/src/video_core/host1x/nvdec.cpp index b8f5866d3..741a7d5c1 100644 --- a/src/video_core/host1x/nvdec.cpp +++ b/src/video_core/host1x/nvdec.cpp @@ -2,6 +2,12 @@ // SPDX-License-Identifier: GPL-2.0-or-later #include "common/assert.h" + +#include "common/polyfill_thread.h" +#include "common/settings.h" +#include "video_core/host1x/codecs/h264.h" +#include "video_core/host1x/codecs/vp8.h" +#include "video_core/host1x/codecs/vp9.h" #include "video_core/host1x/host1x.h" #include "video_core/host1x/nvdec.h" @@ -10,37 +16,69 @@ namespace Tegra::Host1x { #define NVDEC_REG_INDEX(field_name) \ (offsetof(NvdecCommon::NvdecRegisters, field_name) / sizeof(u64)) -Nvdec::Nvdec(Host1x& host1x_) - : host1x(host1x_), state{}, codec(std::make_unique(host1x, state)) {} +Nvdec::Nvdec(Host1x& host1x_, s32 id_, u32 syncpt, FrameQueue& frame_queue_) + : CDmaPusher{host1x_, id_}, id{id_}, syncpoint{syncpt}, frame_queue{frame_queue_} { + LOG_INFO(HW_GPU, "Created nvdec {}", id); + frame_queue.Open(id); +} -Nvdec::~Nvdec() = default; +Nvdec::~Nvdec() { + LOG_INFO(HW_GPU, "Destroying nvdec {}", id); +} void Nvdec::ProcessMethod(u32 method, u32 argument) { - state.reg_array[method] = static_cast(argument) << 8; + regs.reg_array[method] = argument; switch (method) { case NVDEC_REG_INDEX(set_codec_id): - codec->SetTargetCodec(static_cast(argument)); + CreateDecoder(static_cast(argument)); break; - case NVDEC_REG_INDEX(execute): + case NVDEC_REG_INDEX(execute): { + if (wait_needed) { + std::this_thread::sleep_for(std::chrono::milliseconds(32)); + wait_needed = false; + } Execute(); - break; + } break; } } -std::unique_ptr Nvdec::GetFrame() { - return codec->GetCurrentFrame(); +void Nvdec::CreateDecoder(NvdecCommon::VideoCodec codec) { + if (decoder.get()) { + return; + } + switch (codec) { + case NvdecCommon::VideoCodec::H264: + decoder = std::make_unique(host1x, regs, id, frame_queue); + break; + case NvdecCommon::VideoCodec::VP8: + decoder = std::make_unique(host1x, regs, id, frame_queue); + break; + case NvdecCommon::VideoCodec::VP9: + decoder = std::make_unique(host1x, regs, id, frame_queue); + break; + default: + UNIMPLEMENTED_MSG("Codec {}", decoder->GetCurrentCodecName()); + break; + } + LOG_INFO(HW_GPU, "Created decoder {} for id {}", decoder->GetCurrentCodecName(), id); } void Nvdec::Execute() { - switch (codec->GetCurrentCodec()) { + if (Settings::values.nvdec_emulation.GetValue() == Settings::NvdecEmulation::Off) [[unlikely]] { + // Signalling syncpts too fast can cause games to get stuck as they don't expect a <1ms + // execution time. Sleep for half of a 60 fps frame just in case. + std::this_thread::sleep_for(std::chrono::milliseconds(8)); + return; + } + switch (decoder->GetCurrentCodec()) { case NvdecCommon::VideoCodec::H264: case NvdecCommon::VideoCodec::VP8: case NvdecCommon::VideoCodec::VP9: - codec->Decode(); + decoder->Decode(); break; default: - UNIMPLEMENTED_MSG("Codec {}", codec->GetCurrentCodecName()); + UNIMPLEMENTED_MSG("Codec {}", decoder->GetCurrentCodecName()); break; } } diff --git a/src/video_core/host1x/nvdec.h b/src/video_core/host1x/nvdec.h index ddddb8d28..565c65f66 100644 --- a/src/video_core/host1x/nvdec.h +++ b/src/video_core/host1x/nvdec.h @@ -5,33 +5,47 @@ #include #include + #include "common/common_types.h" -#include "video_core/host1x/codecs/codec.h" +#include "video_core/cdma_pusher.h" +#include "video_core/host1x/codecs/decoder.h" namespace Tegra { namespace Host1x { - class Host1x; +class FrameQueue; -class Nvdec { +class Nvdec final : public CDmaPusher { public: - explicit Nvdec(Host1x& host1x); + explicit Nvdec(Host1x& host1x, s32 id, u32 syncpt, FrameQueue& frame_queue_); ~Nvdec(); /// Writes the method into the state, Invoke Execute() if encountered - void ProcessMethod(u32 method, u32 argument); + void ProcessMethod(u32 method, u32 arg) override; - /// Return most recently decoded frame - [[nodiscard]] std::unique_ptr GetFrame(); + u32 GetSyncpoint() const { + return syncpoint; + } + + void SetWait() { + wait_needed = true; + } private: + /// Create the decoder when the codec id is set + void CreateDecoder(NvdecCommon::VideoCodec codec); + /// Invoke codec to decode a frame void Execute(); - Host1x& host1x; - NvdecCommon::NvdecRegisters state; - std::unique_ptr codec; + s32 id; + u32 syncpoint; + FrameQueue& frame_queue; + + NvdecCommon::NvdecRegisters regs{}; + std::unique_ptr decoder; + bool wait_needed{false}; }; } // namespace Host1x diff --git a/src/video_core/host1x/nvdec_common.h b/src/video_core/host1x/nvdec_common.h index 49d67ebbe..dfd8bb377 100644 --- a/src/video_core/host1x/nvdec_common.h +++ b/src/video_core/host1x/nvdec_common.h @@ -17,6 +17,17 @@ enum class VideoCodec : u64 { VP9 = 0x9, }; +struct Offset { + constexpr u64 Address() const noexcept { + return offset << 8; + } + +private: + u64 offset; +}; +static_assert(std::is_trivial_v, "Offset must be trivial"); +static_assert(sizeof(Offset) == 0x8, "Offset has the wrong size!"); + // NVDEC should use a 32-bit address space, but is mapped to 64-bit, // doubling the sizes here is compensating for that. struct NvdecRegisters { @@ -38,29 +49,40 @@ struct NvdecRegisters { BitField<17, 1, u64> all_intra_frame; }; } control_params; - u64 picture_info_offset; ///< 0x0808 - u64 frame_bitstream_offset; ///< 0x0810 - u64 frame_number; ///< 0x0818 - u64 h264_slice_data_offsets; ///< 0x0820 - u64 h264_mv_dump_offset; ///< 0x0828 - INSERT_PADDING_WORDS_NOINIT(6); ///< 0x0830 - u64 frame_stats_offset; ///< 0x0848 - u64 h264_last_surface_luma_offset; ///< 0x0850 - u64 h264_last_surface_chroma_offset; ///< 0x0858 - std::array surface_luma_offset; ///< 0x0860 - std::array surface_chroma_offset; ///< 0x08E8 - INSERT_PADDING_WORDS_NOINIT(68); ///< 0x0970 - u64 vp8_prob_data_offset; ///< 0x0A80 - u64 vp8_header_partition_buf_offset; ///< 0x0A88 - INSERT_PADDING_WORDS_NOINIT(60); ///< 0x0A90 - u64 vp9_entropy_probs_offset; ///< 0x0B80 - u64 vp9_backward_updates_offset; ///< 0x0B88 - u64 vp9_last_frame_segmap_offset; ///< 0x0B90 - u64 vp9_curr_frame_segmap_offset; ///< 0x0B98 - INSERT_PADDING_WORDS_NOINIT(2); ///< 0x0BA0 - u64 vp9_last_frame_mvs_offset; ///< 0x0BA8 - u64 vp9_curr_frame_mvs_offset; ///< 0x0BB0 - INSERT_PADDING_WORDS_NOINIT(2); ///< 0x0BB8 + Offset picture_info_offset; ///< 0x0808 + Offset frame_bitstream_offset; ///< 0x0810 + u64 frame_number; ///< 0x0818 + Offset h264_slice_data_offsets; ///< 0x0820 + Offset h264_mv_dump_offset; ///< 0x0828 + INSERT_PADDING_WORDS_NOINIT(6); ///< 0x0830 + Offset frame_stats_offset; ///< 0x0848 + Offset h264_last_surface_luma_offset; ///< 0x0850 + Offset h264_last_surface_chroma_offset; ///< 0x0858 + std::array surface_luma_offsets; ///< 0x0860 + std::array surface_chroma_offsets; ///< 0x08E8 + Offset pic_scratch_buf_offset; ///< 0x0970 + Offset external_mvbuffer_offset; ///< 0x0978 + INSERT_PADDING_WORDS_NOINIT(32); ///< 0x0980 + Offset h264_mbhist_buffer_offset; ///< 0x0A00 + INSERT_PADDING_WORDS_NOINIT(30); ///< 0x0A08 + Offset vp8_prob_data_offset; ///< 0x0A80 + Offset vp8_header_partition_buf_offset; ///< 0x0A88 + INSERT_PADDING_WORDS_NOINIT(28); ///< 0x0A90 + Offset hvec_scalist_list_offset; ///< 0x0B00 + Offset hvec_tile_sizes_offset; ///< 0x0B08 + Offset hvec_filter_buffer_offset; ///< 0x0B10 + Offset hvec_sao_buffer_offset; ///< 0x0B18 + Offset hvec_slice_info_buffer_offset; ///< 0x0B20 + Offset hvec_slice_group_index_offset; ///< 0x0B28 + INSERT_PADDING_WORDS_NOINIT(20); ///< 0x0B30 + Offset vp9_prob_tab_buffer_offset; ///< 0x0B80 + Offset vp9_ctx_counter_buffer_offset; ///< 0x0B88 + Offset vp9_segment_read_buffer_offset; ///< 0x0B90 + Offset vp9_segment_write_buffer_offset; ///< 0x0B98 + Offset vp9_tile_size_buffer_offset; ///< 0x0BA0 + Offset vp9_col_mvwrite_buffer_offset; ///< 0x0BA8 + Offset vp9_col_mvread_buffer_offset; ///< 0x0BB0 + Offset vp9_filter_buffer_offset; ///< 0x0BB8 }; std::array reg_array; }; @@ -81,16 +103,16 @@ ASSERT_REG_POSITION(h264_slice_data_offsets, 0x104); ASSERT_REG_POSITION(frame_stats_offset, 0x109); ASSERT_REG_POSITION(h264_last_surface_luma_offset, 0x10A); ASSERT_REG_POSITION(h264_last_surface_chroma_offset, 0x10B); -ASSERT_REG_POSITION(surface_luma_offset, 0x10C); -ASSERT_REG_POSITION(surface_chroma_offset, 0x11D); +ASSERT_REG_POSITION(surface_luma_offsets, 0x10C); +ASSERT_REG_POSITION(surface_chroma_offsets, 0x11D); ASSERT_REG_POSITION(vp8_prob_data_offset, 0x150); ASSERT_REG_POSITION(vp8_header_partition_buf_offset, 0x151); -ASSERT_REG_POSITION(vp9_entropy_probs_offset, 0x170); -ASSERT_REG_POSITION(vp9_backward_updates_offset, 0x171); -ASSERT_REG_POSITION(vp9_last_frame_segmap_offset, 0x172); -ASSERT_REG_POSITION(vp9_curr_frame_segmap_offset, 0x173); -ASSERT_REG_POSITION(vp9_last_frame_mvs_offset, 0x175); -ASSERT_REG_POSITION(vp9_curr_frame_mvs_offset, 0x176); +ASSERT_REG_POSITION(vp9_prob_tab_buffer_offset, 0x170); +ASSERT_REG_POSITION(vp9_ctx_counter_buffer_offset, 0x171); +ASSERT_REG_POSITION(vp9_segment_read_buffer_offset, 0x172); +ASSERT_REG_POSITION(vp9_segment_write_buffer_offset, 0x173); +ASSERT_REG_POSITION(vp9_col_mvwrite_buffer_offset, 0x175); +ASSERT_REG_POSITION(vp9_col_mvread_buffer_offset, 0x176); #undef ASSERT_REG_POSITION diff --git a/src/video_core/host1x/syncpoint_manager.cpp b/src/video_core/host1x/syncpoint_manager.cpp index 8f23ce527..8f51c92af 100644 --- a/src/video_core/host1x/syncpoint_manager.cpp +++ b/src/video_core/host1x/syncpoint_manager.cpp @@ -18,7 +18,7 @@ SyncpointManager::ActionHandle SyncpointManager::RegisterAction( return {}; } - std::unique_lock lk(guard); + std::scoped_lock lk(guard); if (syncpoint.load(std::memory_order_relaxed) >= expected_value) { action(); return {}; @@ -35,7 +35,7 @@ SyncpointManager::ActionHandle SyncpointManager::RegisterAction( void SyncpointManager::DeregisterAction(std::list& action_storage, const ActionHandle& handle) { - std::unique_lock lk(guard); + std::scoped_lock lk(guard); // We want to ensure the iterator still exists prior to erasing it // Otherwise, if an invalid iterator was passed in then it could lead to UB @@ -78,7 +78,7 @@ void SyncpointManager::Increment(std::atomic& syncpoint, std::condition_var std::list& action_storage) { auto new_value{syncpoint.fetch_add(1, std::memory_order_acq_rel) + 1}; - std::unique_lock lk(guard); + std::scoped_lock lk(guard); auto it = action_storage.begin(); while (it != action_storage.end()) { if (it->expected_value > new_value) { diff --git a/src/video_core/host1x/vic.cpp b/src/video_core/host1x/vic.cpp index d154746af..3ad56bb80 100644 --- a/src/video_core/host1x/vic.cpp +++ b/src/video_core/host1x/vic.cpp @@ -2,6 +2,21 @@ // SPDX-License-Identifier: GPL-2.0-or-later #include +#include +#include + +#if defined(ARCHITECTURE_x86_64) +#if defined(_MSC_VER) +#include +#else +#include +#endif +#elif defined(ARCHITECTURE_arm64) +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wimplicit-int-conversion" +#include +#pragma GCC diagnostic pop +#endif extern "C" { #if defined(__GNUC__) || defined(__clang__) @@ -14,228 +29,1231 @@ extern "C" { #endif } +#include "common/alignment.h" #include "common/assert.h" #include "common/bit_field.h" #include "common/logging/log.h" +#include "common/polyfill_thread.h" +#include "common/settings.h" #include "video_core/engines/maxwell_3d.h" +#include "video_core/guest_memory.h" #include "video_core/host1x/host1x.h" #include "video_core/host1x/nvdec.h" #include "video_core/host1x/vic.h" #include "video_core/memory_manager.h" #include "video_core/textures/decoders.h" -namespace Tegra { - -namespace Host1x { +#if defined(ARCHITECTURE_x86_64) +#include "common/x64/cpu_detect.h" +#endif +namespace Tegra::Host1x { namespace { -enum class VideoPixelFormat : u64_le { - RGBA8 = 0x1f, - BGRA8 = 0x20, - RGBX8 = 0x23, - YUV420 = 0x44, -}; -} // Anonymous namespace +static bool HasSSE41() { +#if defined(ARCHITECTURE_x86_64) + const auto& cpu_caps{Common::GetCPUCaps()}; + return cpu_caps.sse4_1; +#else + return false; +#endif +} -union VicConfig { - u64_le raw{}; - BitField<0, 7, VideoPixelFormat> pixel_format; - BitField<7, 2, u64_le> chroma_loc_horiz; - BitField<9, 2, u64_le> chroma_loc_vert; - BitField<11, 4, u64_le> block_linear_kind; - BitField<15, 4, u64_le> block_linear_height_log2; - BitField<32, 14, u64_le> surface_width_minus1; - BitField<46, 14, u64_le> surface_height_minus1; -}; +void SwizzleSurface(std::span output, u32 out_stride, std::span input, u32 in_stride, + u32 height) { + /* + * Taken from https://github.com/averne/FFmpeg/blob/nvtegra/libavutil/hwcontext_nvtegra.c#L949 + * Can only handle block height == 1. + */ + const uint32_t x_mask = 0xFFFFFFD2u; + const uint32_t y_mask = 0x2Cu; + uint32_t offs_x{}; + uint32_t offs_y{}; + uint32_t offs_line{}; -Vic::Vic(Host1x& host1x_, std::shared_ptr nvdec_processor_) - : host1x(host1x_), - nvdec_processor(std::move(nvdec_processor_)), converted_frame_buffer{nullptr, av_free} {} + for (u32 y = 0; y < height; y += 2) { + auto dst_line = output.data() + offs_y * 16; + const auto src_line = input.data() + y * (in_stride / 16) * 16; -Vic::~Vic() = default; + offs_line = offs_x; + for (u32 x = 0; x < in_stride; x += 16) { + std::memcpy(&dst_line[offs_line * 16], &src_line[x], 16); + std::memcpy(&dst_line[offs_line * 16 + 16], &src_line[x + in_stride], 16); + offs_line = (offs_line - x_mask) & x_mask; + } -void Vic::ProcessMethod(Method method, u32 argument) { - LOG_DEBUG(HW_GPU, "Vic method 0x{:X}", static_cast(method)); - const u64 arg = static_cast(argument) << 8; - switch (method) { - case Method::Execute: + offs_y = (offs_y - y_mask) & y_mask; + + /* Wrap into next tile row */ + if (!offs_y) { + offs_x += out_stride; + } + } +} + +} // namespace + +Vic::Vic(Host1x& host1x_, s32 id_, u32 syncpt, FrameQueue& frame_queue_) + : CDmaPusher{host1x_, id_}, id{id_}, syncpoint{syncpt}, + frame_queue{frame_queue_}, has_sse41{HasSSE41()} { + LOG_INFO(HW_GPU, "Created vic {}", id); +} + +Vic::~Vic() { + LOG_INFO(HW_GPU, "Destroying vic {}", id); + frame_queue.Close(id); +} + +void Vic::ProcessMethod(u32 method, u32 arg) { + LOG_TRACE(HW_GPU, "Vic {} method 0x{:X}", id, static_cast(method)); + regs.reg_array[method] = arg; + + switch (static_cast(method * sizeof(u32))) { + case Method::Execute: { Execute(); - break; - case Method::SetConfigStructOffset: - config_struct_address = arg; - break; - case Method::SetOutputSurfaceLumaOffset: - output_surface_luma_address = arg; - break; - case Method::SetOutputSurfaceChromaOffset: - output_surface_chroma_address = arg; - break; + } break; default: break; } } void Vic::Execute() { - if (output_surface_luma_address == 0) { - LOG_ERROR(Service_NVDRV, "VIC Luma address not set."); - return; - } - const VicConfig config{host1x.GMMU().Read(config_struct_address + 0x20)}; - auto frame = nvdec_processor->GetFrame(); - if (!frame) { - return; - } - const u64 surface_width = config.surface_width_minus1 + 1; - const u64 surface_height = config.surface_height_minus1 + 1; - if (static_cast(frame->GetWidth()) != surface_width || - static_cast(frame->GetHeight()) != surface_height) { - // TODO: Properly support multiple video streams with differing frame dimensions - LOG_WARNING(Service_NVDRV, "Frame dimensions {}x{} don't match surface dimensions {}x{}", - frame->GetWidth(), frame->GetHeight(), surface_width, surface_height); - } - switch (config.pixel_format) { - case VideoPixelFormat::RGBA8: - case VideoPixelFormat::BGRA8: - case VideoPixelFormat::RGBX8: - WriteRGBFrame(std::move(frame), config); - break; - case VideoPixelFormat::YUV420: - WriteYUVFrame(std::move(frame), config); - break; - default: - UNIMPLEMENTED_MSG("Unknown video pixel format {:X}", config.pixel_format.Value()); - break; - } -} + ConfigStruct config{}; + memory_manager.ReadBlock(regs.config_struct_offset.Address(), &config, sizeof(ConfigStruct)); -void Vic::WriteRGBFrame(std::unique_ptr frame, const VicConfig& config) { - LOG_TRACE(Service_NVDRV, "Writing RGB Frame"); + auto output_width{config.output_surface_config.out_surface_width + 1}; + auto output_height{config.output_surface_config.out_surface_height + 1}; + output_surface.resize_destructive(output_width * output_height); - const auto frame_width = frame->GetWidth(); - const auto frame_height = frame->GetHeight(); - const auto frame_format = frame->GetPixelFormat(); - - if (!scaler_ctx || frame_width != scaler_width || frame_height != scaler_height) { - const AVPixelFormat target_format = [pixel_format = config.pixel_format]() { - switch (pixel_format) { - case VideoPixelFormat::RGBA8: - return AV_PIX_FMT_RGBA; - case VideoPixelFormat::BGRA8: - return AV_PIX_FMT_BGRA; - case VideoPixelFormat::RGBX8: - return AV_PIX_FMT_RGB0; - default: - return AV_PIX_FMT_RGBA; - } - }(); - - sws_freeContext(scaler_ctx); - // Frames are decoded into either YUV420 or NV12 formats. Convert to desired RGB format - scaler_ctx = sws_getContext(frame_width, frame_height, frame_format, frame_width, - frame_height, target_format, 0, nullptr, nullptr, nullptr); - scaler_width = frame_width; - scaler_height = frame_height; - converted_frame_buffer.reset(); - } - if (!converted_frame_buffer) { - const size_t frame_size = frame_width * frame_height * 4; - converted_frame_buffer = AVMallocPtr{static_cast(av_malloc(frame_size)), av_free}; - } - const std::array converted_stride{frame_width * 4, frame_height * 4, 0, 0}; - u8* const converted_frame_buf_addr{converted_frame_buffer.get()}; - sws_scale(scaler_ctx, frame->GetPlanes(), frame->GetStrides(), 0, frame_height, - &converted_frame_buf_addr, converted_stride.data()); - - // Use the minimum of surface/frame dimensions to avoid buffer overflow. - const u32 surface_width = static_cast(config.surface_width_minus1) + 1; - const u32 surface_height = static_cast(config.surface_height_minus1) + 1; - const u32 width = std::min(surface_width, static_cast(frame_width)); - const u32 height = std::min(surface_height, static_cast(frame_height)); - const u32 blk_kind = static_cast(config.block_linear_kind); - if (blk_kind != 0) { - // swizzle pitch linear to block linear - const u32 block_height = static_cast(config.block_linear_height_log2); - const auto size = Texture::CalculateSize(true, 4, width, height, 1, block_height, 0); - luma_buffer.resize_destructive(size); - std::span frame_buff(converted_frame_buf_addr, 4 * width * height); - Texture::SwizzleSubrect(luma_buffer, frame_buff, 4, width, height, 1, 0, 0, width, height, - block_height, 0, width * 4); - - host1x.GMMU().WriteBlock(output_surface_luma_address, luma_buffer.data(), size); + if (Settings::values.nvdec_emulation.GetValue() == Settings::NvdecEmulation::Off) [[unlikely]] { + // Fill the frame with black, as otherwise they can have random data and be very glitchy. + std::fill(output_surface.begin(), output_surface.end(), Pixel{}); } else { - // send pitch linear frame - const size_t linear_size = width * height * 4; - host1x.GMMU().WriteBlock(output_surface_luma_address, converted_frame_buf_addr, - linear_size); + for (size_t i = 0; i < config.slot_structs.size(); i++) { + auto& slot_config{config.slot_structs[i]}; + if (!slot_config.config.slot_enable) { + continue; + } + + auto luma_offset{regs.surfaces[i][SurfaceIndex::Current].luma.Address()}; + if (nvdec_id == -1) { + nvdec_id = frame_queue.VicFindNvdecFdFromOffset(luma_offset); + } + + auto frame = frame_queue.GetFrame(nvdec_id, luma_offset); + if (!frame.get()) { + LOG_ERROR(HW_GPU, "Vic {} failed to get frame with offset 0x{:X}", id, luma_offset); + continue; + } + + switch (frame->GetPixelFormat()) { + case AV_PIX_FMT_YUV420P: + ReadY8__V8U8_N420(slot_config, regs.surfaces[i], std::move(frame)); + break; + case AV_PIX_FMT_NV12: + ReadY8__V8U8_N420(slot_config, regs.surfaces[i], std::move(frame)); + break; + default: + UNIMPLEMENTED_MSG( + "Unimplemented slot pixel format {}", + static_cast(slot_config.surface_config.slot_pixel_format.Value())); + break; + } + + Blend(config, slot_config); + } + } + + switch (config.output_surface_config.out_pixel_format) { + case VideoPixelFormat::A8B8G8R8: + case VideoPixelFormat::X8B8G8R8: + WriteABGR(config.output_surface_config); + break; + case VideoPixelFormat::A8R8G8B8: + WriteABGR(config.output_surface_config); + break; + case VideoPixelFormat::Y8__V8U8_N420: + WriteY8__V8U8_N420(config.output_surface_config); + break; + default: + UNIMPLEMENTED_MSG("Unknown video pixel format {}", + config.output_surface_config.out_pixel_format.Value()); + break; } } -void Vic::WriteYUVFrame(std::unique_ptr frame, const VicConfig& config) { - LOG_TRACE(Service_NVDRV, "Writing YUV420 Frame"); +template +void Vic::ReadProgressiveY8__V8U8_N420(const SlotStruct& slot, + std::span offsets, + std::shared_ptr frame) { + const auto out_luma_width{slot.surface_config.slot_surface_width + 1}; + auto out_luma_height{slot.surface_config.slot_surface_height + 1}; + const auto out_luma_stride{out_luma_width}; - const std::size_t surface_width = config.surface_width_minus1 + 1; - const std::size_t surface_height = config.surface_height_minus1 + 1; - const std::size_t aligned_width = (surface_width + 0xff) & ~0xffUL; - // Use the minimum of surface/frame dimensions to avoid buffer overflow. - const auto frame_width = std::min(surface_width, static_cast(frame->GetWidth())); - const auto frame_height = std::min(surface_height, static_cast(frame->GetHeight())); - - const auto stride = static_cast(frame->GetStride(0)); - - luma_buffer.resize_destructive(aligned_width * surface_height); - chroma_buffer.resize_destructive(aligned_width * surface_height / 2); - - // Populate luma buffer - const u8* luma_src = frame->GetData(0); - for (std::size_t y = 0; y < frame_height; ++y) { - const std::size_t src = y * stride; - const std::size_t dst = y * aligned_width; - std::memcpy(luma_buffer.data() + dst, luma_src + src, frame_width); + if constexpr (Interlaced) { + out_luma_height *= 2; } - host1x.GMMU().WriteBlock(output_surface_luma_address, luma_buffer.data(), luma_buffer.size()); - // Chroma - const std::size_t half_height = frame_height / 2; - const auto half_stride = static_cast(frame->GetStride(1)); + slot_surface.resize_destructive(out_luma_width * out_luma_height); - switch (frame->GetPixelFormat()) { - case AV_PIX_FMT_YUV420P: { - // Frame from FFmpeg software - // Populate chroma buffer from both channels with interleaving. - const std::size_t half_width = frame_width / 2; - u8* chroma_buffer_data = chroma_buffer.data(); - const u8* chroma_b_src = frame->GetData(1); - const u8* chroma_r_src = frame->GetData(2); - for (std::size_t y = 0; y < half_height; ++y) { - const std::size_t src = y * half_stride; - const std::size_t dst = y * aligned_width; - for (std::size_t x = 0; x < half_width; ++x) { - chroma_buffer_data[dst + x * 2] = chroma_b_src[src + x]; - chroma_buffer_data[dst + x * 2 + 1] = chroma_r_src[src + x]; + const auto in_luma_width{std::min(frame->GetWidth(), static_cast(out_luma_width))}; + const auto in_luma_height{std::min(frame->GetHeight(), static_cast(out_luma_height))}; + const auto in_luma_stride{frame->GetStride(0)}; + + const auto in_chroma_stride{frame->GetStride(1)}; + + const auto* luma_buffer{frame->GetPlane(0)}; + const auto* chroma_u_buffer{frame->GetPlane(1)}; + const auto* chroma_v_buffer{frame->GetPlane(2)}; + + LOG_TRACE(HW_GPU, + "Reading frame" + "\ninput luma {}x{} stride {} chroma {}x{} stride {}\n" + "output luma {}x{} stride {} chroma {}x{} stride {}", + in_luma_width, in_luma_height, in_luma_stride, in_luma_width / 2, in_luma_height / 2, + in_chroma_stride, out_luma_width, out_luma_height, out_luma_stride, out_luma_width, + out_luma_height, out_luma_stride); + + [[maybe_unused]] auto DecodeLinear = [&]() { + const auto alpha{static_cast(slot.config.planar_alpha.Value())}; + + for (s32 y = 0; y < in_luma_height; y++) { + const auto src_luma{y * in_luma_stride}; + const auto src_chroma{(y / 2) * in_chroma_stride}; + const auto dst{y * out_luma_stride}; + for (s32 x = 0; x < in_luma_width; x++) { + slot_surface[dst + x].r = static_cast(luma_buffer[src_luma + x] << 2); + // Chroma samples are duplicated horizontally and vertically. + if constexpr (Planar) { + slot_surface[dst + x].g = + static_cast(chroma_u_buffer[src_chroma + x / 2] << 2); + slot_surface[dst + x].b = + static_cast(chroma_v_buffer[src_chroma + x / 2] << 2); + } else { + slot_surface[dst + x].g = + static_cast(chroma_u_buffer[src_chroma + (x & ~1) + 0] << 2); + slot_surface[dst + x].b = + static_cast(chroma_u_buffer[src_chroma + (x & ~1) + 1] << 2); + } + slot_surface[dst + x].a = alpha; } } - break; + }; + +#if defined(ARCHITECTURE_x86_64) + if (!has_sse41) { + DecodeLinear(); + return; } - case AV_PIX_FMT_NV12: { - // Frame from VA-API hardware - // This is already interleaved so just copy - const u8* chroma_src = frame->GetData(1); - for (std::size_t y = 0; y < half_height; ++y) { - const std::size_t src = y * stride; - const std::size_t dst = y * aligned_width; - std::memcpy(chroma_buffer.data() + dst, chroma_src + src, frame_width); +#endif + +#if defined(ARCHITECTURE_x86_64) || defined(ARCHITECTURE_arm64) + const auto alpha_linear{static_cast(slot.config.planar_alpha.Value())}; + const auto alpha = + _mm_slli_epi64(_mm_set1_epi64x(static_cast(slot.config.planar_alpha.Value())), 48); + + const auto shuffle_mask = _mm_set_epi8(13, 15, 14, 12, 9, 11, 10, 8, 5, 7, 6, 4, 1, 3, 2, 0); + const auto sse_aligned_width = Common::AlignDown(in_luma_width, 16); + + for (s32 y = 0; y < in_luma_height; y++) { + const auto src_luma{y * in_luma_stride}; + const auto src_chroma{(y / 2) * in_chroma_stride}; + const auto dst{y * out_luma_stride}; + s32 x = 0; + for (; x < sse_aligned_width; x += 16) { + // clang-format off + // Prefetch next iteration's memory + _mm_prefetch((const char*)&luma_buffer[src_luma + x + 16], _MM_HINT_T0); + + // Load 8 bytes * 2 of 8-bit luma samples + // luma0 = 00 00 00 00 00 00 00 00 LL LL LL LL LL LL LL LL + auto luma0 = _mm_loadl_epi64((__m128i*)&luma_buffer[src_luma + x + 0]); + auto luma1 = _mm_loadl_epi64((__m128i*)&luma_buffer[src_luma + x + 8]); + + __m128i chroma; + + if constexpr (Planar) { + _mm_prefetch((const char*)&chroma_u_buffer[src_chroma + x / 2 + 8], _MM_HINT_T0); + _mm_prefetch((const char*)&chroma_v_buffer[src_chroma + x / 2 + 8], _MM_HINT_T0); + + // If Chroma is planar, we have separate U and V planes, load 8 bytes of each + // chroma_u0 = 00 00 00 00 00 00 00 00 UU UU UU UU UU UU UU UU + // chroma_v0 = 00 00 00 00 00 00 00 00 VV VV VV VV VV VV VV VV + auto chroma_u0 = _mm_loadl_epi64((__m128i*)&chroma_u_buffer[src_chroma + x / 2]); + auto chroma_v0 = _mm_loadl_epi64((__m128i*)&chroma_v_buffer[src_chroma + x / 2]); + + // Interleave the 8 bytes of U and V into a single 16 byte reg + // chroma = VV UU VV UU VV UU VV UU VV UU VV UU VV UU VV UU + chroma = _mm_unpacklo_epi8(chroma_u0, chroma_v0); + } else { + _mm_prefetch((const char*)&chroma_u_buffer[src_chroma + x / 2 + 8], _MM_HINT_T0); + + // Chroma is already interleaved in semiplanar format, just load 16 bytes + // chroma = VV UU VV UU VV UU VV UU VV UU VV UU VV UU VV UU + chroma = _mm_load_si128((__m128i*)&chroma_u_buffer[src_chroma + x]); + } + + // Convert the low 8 bytes of 8-bit luma into 16-bit luma + // luma0 = [00] [00] [00] [00] [00] [00] [00] [00] [LL] [LL] [LL] [LL] [LL] [LL] [LL] [LL] + // -> + // luma0 = [00 LL] [00 LL] [00 LL] [00 LL] [00 LL] [00 LL] [00 LL] [00 LL] + luma0 = _mm_cvtepu8_epi16(luma0); + luma1 = _mm_cvtepu8_epi16(luma1); + + // Treat the 8 bytes of 8-bit chroma as 16-bit channels, this allows us to take both the + // U and V together as one element. Using chroma twice here duplicates the values, as we + // take element 0 from chroma, and then element 0 from chroma again, etc. We need to + // duplicate chroma horitonally as chroma is half the width of luma. + // chroma = [VV8 UU8] [VV7 UU7] [VV6 UU6] [VV5 UU5] [VV4 UU4] [VV3 UU3] [VV2 UU2] [VV1 UU1] + // -> + // chroma00 = [VV4 UU4] [VV4 UU4] [VV3 UU3] [VV3 UU3] [VV2 UU2] [VV2 UU2] [VV1 UU1] [VV1 UU1] + // chroma01 = [VV8 UU8] [VV8 UU8] [VV7 UU7] [VV7 UU7] [VV6 UU6] [VV6 UU6] [VV5 UU5] [VV5 UU5] + auto chroma00 = _mm_unpacklo_epi16(chroma, chroma); + auto chroma01 = _mm_unpackhi_epi16(chroma, chroma); + + // Interleave the 16-bit luma and chroma. + // luma0 = [008 LL8] [007 LL7] [006 LL6] [005 LL5] [004 LL4] [003 LL3] [002 LL2] [001 LL1] + // chroma00 = [VV8 UU8] [VV7 UU7] [VV6 UU6] [VV5 UU5] [VV4 UU4] [VV3 UU3] [VV2 UU2] [VV1 UU1] + // -> + // yuv0 = [VV4 UU4 004 LL4] [VV3 UU3 003 LL3] [VV2 UU2 002 LL2] [VV1 UU1 001 LL1] + // yuv1 = [VV8 UU8 008 LL8] [VV7 UU7 007 LL7] [VV6 UU6 006 LL6] [VV5 UU5 005 LL5] + auto yuv0 = _mm_unpacklo_epi16(luma0, chroma00); + auto yuv1 = _mm_unpackhi_epi16(luma0, chroma00); + auto yuv2 = _mm_unpacklo_epi16(luma1, chroma01); + auto yuv3 = _mm_unpackhi_epi16(luma1, chroma01); + + // Shuffle the luma/chroma into the channel ordering we actually want. The high byte of + // the luma which is now a constant 0 after converting 8-bit -> 16-bit is used as the + // alpha. Luma -> R, U -> G, V -> B, 0 -> A + // yuv0 = [VV4 UU4 004 LL4] [VV3 UU3 003 LL3] [VV2 UU2 002 LL2] [VV1 UU1 001 LL1] + // -> + // yuv0 = [AA4 VV4 UU4 LL4] [AA3 VV3 UU3 LL3] [AA2 VV2 UU2 LL2] [AA1 VV1 UU1 LL1] + yuv0 = _mm_shuffle_epi8(yuv0, shuffle_mask); + yuv1 = _mm_shuffle_epi8(yuv1, shuffle_mask); + yuv2 = _mm_shuffle_epi8(yuv2, shuffle_mask); + yuv3 = _mm_shuffle_epi8(yuv3, shuffle_mask); + + // Extend the 8-bit channels we have into 16-bits, as that's the target surface format. + // Since this turns just the low 8 bytes into 16 bytes, the second of + // each operation here right shifts the register by 8 to get the high pixels. + // yuv0 = [AA4] [VV4] [UU4] [LL4] [AA3] [VV3] [UU3] [LL3] [AA2] [VV2] [UU2] [LL2] [AA1] [VV1] [UU1] [LL1] + // -> + // yuv01 = [002 AA2] [002 VV2] [002 UU2] [002 LL2] [001 AA1] [001 VV1] [001 UU1] [001 LL1] + // yuv23 = [004 AA4] [004 VV4] [004 UU4] [004 LL4] [003 AA3] [003 VV3] ]003 UU3] [003 LL3] + auto yuv01 = _mm_cvtepu8_epi16(yuv0); + auto yuv23 = _mm_cvtepu8_epi16(_mm_srli_si128(yuv0, 8)); + auto yuv45 = _mm_cvtepu8_epi16(yuv1); + auto yuv67 = _mm_cvtepu8_epi16(_mm_srli_si128(yuv1, 8)); + auto yuv89 = _mm_cvtepu8_epi16(yuv2); + auto yuv1011 = _mm_cvtepu8_epi16(_mm_srli_si128(yuv2, 8)); + auto yuv1213 = _mm_cvtepu8_epi16(yuv3); + auto yuv1415 = _mm_cvtepu8_epi16(_mm_srli_si128(yuv3, 8)); + + // Left-shift all 16-bit channels by 2, this is to get us into a 10-bit format instead + // of 8, which is the format alpha is in, as well as other blending values. + yuv01 = _mm_slli_epi16(yuv01, 2); + yuv23 = _mm_slli_epi16(yuv23, 2); + yuv45 = _mm_slli_epi16(yuv45, 2); + yuv67 = _mm_slli_epi16(yuv67, 2); + yuv89 = _mm_slli_epi16(yuv89, 2); + yuv1011 = _mm_slli_epi16(yuv1011, 2); + yuv1213 = _mm_slli_epi16(yuv1213, 2); + yuv1415 = _mm_slli_epi16(yuv1415, 2); + + // OR in the planar alpha, this has already been duplicated and shifted into position, + // and just fills in the AA channels with the actual alpha value. + yuv01 = _mm_or_si128(yuv01, alpha); + yuv23 = _mm_or_si128(yuv23, alpha); + yuv45 = _mm_or_si128(yuv45, alpha); + yuv67 = _mm_or_si128(yuv67, alpha); + yuv89 = _mm_or_si128(yuv89, alpha); + yuv1011 = _mm_or_si128(yuv1011, alpha); + yuv1213 = _mm_or_si128(yuv1213, alpha); + yuv1415 = _mm_or_si128(yuv1415, alpha); + + // Store out the pixels. One pixel is now 8 bytes, so each store is 2 pixels. + // [AA AA] [VV VV] [UU UU] [LL LL] [AA AA] [VV VV] [UU UU] [LL LL] + _mm_store_si128((__m128i*)&slot_surface[dst + x + 0], yuv01); + _mm_store_si128((__m128i*)&slot_surface[dst + x + 2], yuv23); + _mm_store_si128((__m128i*)&slot_surface[dst + x + 4], yuv45); + _mm_store_si128((__m128i*)&slot_surface[dst + x + 6], yuv67); + _mm_store_si128((__m128i*)&slot_surface[dst + x + 8], yuv89); + _mm_store_si128((__m128i*)&slot_surface[dst + x + 10], yuv1011); + _mm_store_si128((__m128i*)&slot_surface[dst + x + 12], yuv1213); + _mm_store_si128((__m128i*)&slot_surface[dst + x + 14], yuv1415); + + // clang-format on + } + + for (; x < in_luma_width; x++) { + slot_surface[dst + x].r = static_cast(luma_buffer[src_luma + x] << 2); + // Chroma samples are duplicated horizontally and vertically. + if constexpr (Planar) { + slot_surface[dst + x].g = + static_cast(chroma_u_buffer[src_chroma + x / 2] << 2); + slot_surface[dst + x].b = + static_cast(chroma_v_buffer[src_chroma + x / 2] << 2); + } else { + slot_surface[dst + x].g = + static_cast(chroma_u_buffer[src_chroma + (x & ~1) + 0] << 2); + slot_surface[dst + x].b = + static_cast(chroma_u_buffer[src_chroma + (x & ~1) + 1] << 2); + } + slot_surface[dst + x].a = alpha_linear; } - break; } - default: - ASSERT(false); - break; - } - host1x.GMMU().WriteBlock(output_surface_chroma_address, chroma_buffer.data(), - chroma_buffer.size()); +#else + DecodeLinear(); +#endif } -} // namespace Host1x +template +void Vic::ReadInterlacedY8__V8U8_N420(const SlotStruct& slot, std::span offsets, + std::shared_ptr frame) { + if constexpr (!Planar) { + ReadProgressiveY8__V8U8_N420(slot, offsets, std::move(frame)); + return; + } + const auto out_luma_width{slot.surface_config.slot_surface_width + 1}; + const auto out_luma_height{(slot.surface_config.slot_surface_height + 1) * 2}; + const auto out_luma_stride{out_luma_width}; -} // namespace Tegra + slot_surface.resize_destructive(out_luma_width * out_luma_height); + + const auto in_luma_width{std::min(frame->GetWidth(), static_cast(out_luma_width))}; + [[maybe_unused]] const auto in_luma_height{ + std::min(frame->GetHeight(), static_cast(out_luma_height))}; + const auto in_luma_stride{frame->GetStride(0)}; + + [[maybe_unused]] const auto in_chroma_width{(frame->GetWidth() + 1) / 2}; + const auto in_chroma_height{(frame->GetHeight() + 1) / 2}; + const auto in_chroma_stride{frame->GetStride(1)}; + + const auto* luma_buffer{frame->GetPlane(0)}; + const auto* chroma_u_buffer{frame->GetPlane(1)}; + const auto* chroma_v_buffer{frame->GetPlane(2)}; + + LOG_TRACE(HW_GPU, + "Reading frame" + "\ninput luma {}x{} stride {} chroma {}x{} stride {}\n" + "output luma {}x{} stride {} chroma {}x{} stride {}", + in_luma_width, in_luma_height, in_luma_stride, in_chroma_width, in_chroma_height, + in_chroma_stride, out_luma_width, out_luma_height, out_luma_stride, + out_luma_width / 2, out_luma_height / 2, out_luma_stride); + + [[maybe_unused]] auto DecodeLinear = [&]() { + auto DecodeBobField = [&]() { + const auto alpha{static_cast(slot.config.planar_alpha.Value())}; + + for (s32 y = static_cast(TopField == false); y < in_chroma_height * 2; y += 2) { + const auto src_luma{y * in_luma_stride}; + const auto src_chroma{(y / 2) * in_chroma_stride}; + const auto dst{y * out_luma_stride}; + for (s32 x = 0; x < in_luma_width; x++) { + slot_surface[dst + x].r = static_cast(luma_buffer[src_luma + x] << 2); + if constexpr (Planar) { + slot_surface[dst + x].g = + static_cast(chroma_u_buffer[src_chroma + x / 2] << 2); + slot_surface[dst + x].b = + static_cast(chroma_v_buffer[src_chroma + x / 2] << 2); + } else { + slot_surface[dst + x].g = + static_cast(chroma_u_buffer[src_chroma + (x & ~1) + 0] << 2); + slot_surface[dst + x].b = + static_cast(chroma_u_buffer[src_chroma + (x & ~1) + 1] << 2); + } + slot_surface[dst + x].a = alpha; + } + + s32 other_line{}; + if constexpr (TopField) { + other_line = (y + 1) * out_luma_stride; + } else { + other_line = (y - 1) * out_luma_stride; + } + std::memcpy(&slot_surface[other_line], &slot_surface[dst], + out_luma_width * sizeof(Pixel)); + } + }; + + switch (slot.config.deinterlace_mode) { + case DXVAHD_DEINTERLACE_MODE_PRIVATE::WEAVE: + // Due to the fact that we do not write to memory in nvdec, we cannot use Weave as it + // relies on the previous frame. + DecodeBobField(); + break; + case DXVAHD_DEINTERLACE_MODE_PRIVATE::BOB_FIELD: + DecodeBobField(); + break; + case DXVAHD_DEINTERLACE_MODE_PRIVATE::DISI1: + // Due to the fact that we do not write to memory in nvdec, we cannot use DISI1 as it + // relies on previous/next frames. + DecodeBobField(); + break; + default: + UNIMPLEMENTED_MSG("Deinterlace mode {} not implemented!", + static_cast(slot.config.deinterlace_mode.Value())); + break; + } + }; + + DecodeLinear(); +} + +template +void Vic::ReadY8__V8U8_N420(const SlotStruct& slot, std::span offsets, + std::shared_ptr frame) { + switch (slot.config.frame_format) { + case DXVAHD_FRAME_FORMAT::PROGRESSIVE: + ReadProgressiveY8__V8U8_N420(slot, offsets, std::move(frame)); + break; + case DXVAHD_FRAME_FORMAT::TOP_FIELD: + ReadInterlacedY8__V8U8_N420(slot, offsets, std::move(frame)); + break; + case DXVAHD_FRAME_FORMAT::BOTTOM_FIELD: + ReadInterlacedY8__V8U8_N420(slot, offsets, std::move(frame)); + break; + default: + LOG_ERROR(HW_GPU, "Unknown deinterlace format {}", + static_cast(slot.config.frame_format.Value())); + break; + } +} + +void Vic::Blend(const ConfigStruct& config, const SlotStruct& slot) { + constexpr auto add_one([](u32 v) -> u32 { return v != 0 ? v + 1 : 0; }); + + auto source_left{add_one(static_cast(slot.config.source_rect_left.Value()))}; + auto source_right{add_one(static_cast(slot.config.source_rect_right.Value()))}; + auto source_top{add_one(static_cast(slot.config.source_rect_top.Value()))}; + auto source_bottom{add_one(static_cast(slot.config.source_rect_bottom.Value()))}; + + const auto dest_left{add_one(static_cast(slot.config.dest_rect_left.Value()))}; + const auto dest_right{add_one(static_cast(slot.config.dest_rect_right.Value()))}; + const auto dest_top{add_one(static_cast(slot.config.dest_rect_top.Value()))}; + const auto dest_bottom{add_one(static_cast(slot.config.dest_rect_bottom.Value()))}; + + auto rect_left{add_one(config.output_config.target_rect_left.Value())}; + auto rect_right{add_one(config.output_config.target_rect_right.Value())}; + auto rect_top{add_one(config.output_config.target_rect_top.Value())}; + auto rect_bottom{add_one(config.output_config.target_rect_bottom.Value())}; + + rect_left = std::max(rect_left, dest_left); + rect_right = std::min(rect_right, dest_right); + rect_top = std::max(rect_top, dest_top); + rect_bottom = std::min(rect_bottom, dest_bottom); + + source_left = std::max(source_left, rect_left); + source_right = std::min(source_right, rect_right); + source_top = std::max(source_top, rect_top); + source_bottom = std::min(source_bottom, rect_bottom); + + if (source_left >= source_right || source_top >= source_bottom) { + return; + } + + const auto out_surface_width{config.output_surface_config.out_surface_width + 1}; + [[maybe_unused]] const auto out_surface_height{config.output_surface_config.out_surface_height + + 1}; + const auto in_surface_width{slot.surface_config.slot_surface_width + 1}; + + source_bottom = std::min(source_bottom, out_surface_height); + source_right = std::min(source_right, out_surface_width); + + // TODO Alpha blending. No games I've seen use more than a single surface or supply an alpha + // below max, so it's ignored for now. + + if (!slot.color_matrix.matrix_enable) { + const auto copy_width = std::min(source_right - source_left, rect_right - rect_left); + + for (u32 y = source_top; y < source_bottom; y++) { + const auto dst_line = y * out_surface_width; + const auto src_line = y * in_surface_width; + std::memcpy(&output_surface[dst_line + rect_left], + &slot_surface[src_line + source_left], copy_width * sizeof(Pixel)); + } + } else { + // clang-format off + // Colour conversion is enabled, this is a 3x4 * 4x1 matrix multiplication, resulting in a 3x1 matrix. + // | r0c0 r0c1 r0c2 r0c3 | | R | | R | + // | r1c0 r1c1 r1c2 r1c3 | * | G | = | G | + // | r2c0 r2c1 r2c2 r2c3 | | B | | B | + // | 1 | + // clang-format on + + [[maybe_unused]] auto DecodeLinear = [&]() { + const auto r0c0 = static_cast(slot.color_matrix.matrix_coeff00.Value()); + const auto r0c1 = static_cast(slot.color_matrix.matrix_coeff01.Value()); + const auto r0c2 = static_cast(slot.color_matrix.matrix_coeff02.Value()); + const auto r0c3 = static_cast(slot.color_matrix.matrix_coeff03.Value()); + const auto r1c0 = static_cast(slot.color_matrix.matrix_coeff10.Value()); + const auto r1c1 = static_cast(slot.color_matrix.matrix_coeff11.Value()); + const auto r1c2 = static_cast(slot.color_matrix.matrix_coeff12.Value()); + const auto r1c3 = static_cast(slot.color_matrix.matrix_coeff13.Value()); + const auto r2c0 = static_cast(slot.color_matrix.matrix_coeff20.Value()); + const auto r2c1 = static_cast(slot.color_matrix.matrix_coeff21.Value()); + const auto r2c2 = static_cast(slot.color_matrix.matrix_coeff22.Value()); + const auto r2c3 = static_cast(slot.color_matrix.matrix_coeff23.Value()); + + const auto shift = static_cast(slot.color_matrix.matrix_r_shift.Value()); + const auto clamp_min = static_cast(slot.config.soft_clamp_low.Value()); + const auto clamp_max = static_cast(slot.config.soft_clamp_high.Value()); + + auto MatMul = [&](const Pixel& in_pixel) -> std::tuple { + auto r = static_cast(in_pixel.r); + auto g = static_cast(in_pixel.g); + auto b = static_cast(in_pixel.b); + + r = in_pixel.r * r0c0 + in_pixel.g * r0c1 + in_pixel.b * r0c2; + g = in_pixel.r * r1c0 + in_pixel.g * r1c1 + in_pixel.b * r1c2; + b = in_pixel.r * r2c0 + in_pixel.g * r2c1 + in_pixel.b * r2c2; + + r >>= shift; + g >>= shift; + b >>= shift; + + r += r0c3; + g += r1c3; + b += r2c3; + + r >>= 8; + g >>= 8; + b >>= 8; + + return {r, g, b, static_cast(in_pixel.a)}; + }; + + for (u32 y = source_top; y < source_bottom; y++) { + const auto src{y * in_surface_width + source_left}; + const auto dst{y * out_surface_width + rect_left}; + for (u32 x = source_left; x < source_right; x++) { + auto [r, g, b, a] = MatMul(slot_surface[src + x]); + + r = std::clamp(r, clamp_min, clamp_max); + g = std::clamp(g, clamp_min, clamp_max); + b = std::clamp(b, clamp_min, clamp_max); + a = std::clamp(a, clamp_min, clamp_max); + + output_surface[dst + x] = {static_cast(r), static_cast(g), + static_cast(b), static_cast(a)}; + } + } + }; + +#if defined(ARCHITECTURE_x86_64) + if (!has_sse41) { + DecodeLinear(); + return; + } +#endif + +#if defined(ARCHITECTURE_x86_64) || defined(ARCHITECTURE_arm64) + // Fill the columns, e.g + // c0 = [00 00 00 00] [r2c0 r2c0 r2c0 r2c0] [r1c0 r1c0 r1c0 r1c0] [r0c0 r0c0 r0c0 r0c0] + + const auto c0 = _mm_set_epi32(0, static_cast(slot.color_matrix.matrix_coeff20.Value()), + static_cast(slot.color_matrix.matrix_coeff10.Value()), + static_cast(slot.color_matrix.matrix_coeff00.Value())); + const auto c1 = _mm_set_epi32(0, static_cast(slot.color_matrix.matrix_coeff21.Value()), + static_cast(slot.color_matrix.matrix_coeff11.Value()), + static_cast(slot.color_matrix.matrix_coeff01.Value())); + const auto c2 = _mm_set_epi32(0, static_cast(slot.color_matrix.matrix_coeff22.Value()), + static_cast(slot.color_matrix.matrix_coeff12.Value()), + static_cast(slot.color_matrix.matrix_coeff02.Value())); + const auto c3 = _mm_set_epi32(0, static_cast(slot.color_matrix.matrix_coeff23.Value()), + static_cast(slot.color_matrix.matrix_coeff13.Value()), + static_cast(slot.color_matrix.matrix_coeff03.Value())); + + // Set the matrix right-shift as a single element. + const auto shift = + _mm_set_epi32(0, 0, 0, static_cast(slot.color_matrix.matrix_r_shift.Value())); + + // Set every 16-bit value to the soft clamp values for clamping every 16-bit channel. + const auto clamp_min = _mm_set1_epi16(static_cast(slot.config.soft_clamp_low.Value())); + const auto clamp_max = + _mm_set1_epi16(static_cast(slot.config.soft_clamp_high.Value())); + + // clang-format off + + auto MatMul = [](__m128i& p, const __m128i& col0, const __m128i& col1, const __m128i& col2, + const __m128i& col3, const __m128i& trm_shift) -> __m128i { + // Duplicate the 32-bit channels, e.g + // p = [AA AA AA AA] [BB BB BB BB] [GG GG GG GG] [RR RR RR RR] + // -> + // r = [RR4 RR4 RR4 RR4] [RR3 RR3 RR3 RR3] [RR2 RR2 RR2 RR2] [RR1 RR1 RR1 RR1] + auto r = _mm_shuffle_epi32(p, 0x0); + auto g = _mm_shuffle_epi32(p, 0x55); + auto b = _mm_shuffle_epi32(p, 0xAA); + + // Multiply the rows and columns c0 * r, c1 * g, c2 * b, e.g + // r = [RR4 RR4 RR4 RR4] [ RR3 RR3 RR3 RR3] [ RR2 RR2 RR2 RR2] [ RR1 RR1 RR1 RR1] + // * + // c0 = [ 00 00 00 00] [r2c0 r2c0 r2c0 r2c0] [r1c0 r1c0 r1c0 r1c0] [r0c0 r0c0 r0c0 r0c0] + r = _mm_mullo_epi32(r, col0); + g = _mm_mullo_epi32(g, col1); + b = _mm_mullo_epi32(b, col2); + + // Add them all together vertically, such that the 32-bit element + // out[0] = (r[0] * c0[0]) + (g[0] * c1[0]) + (b[0] * c2[0]) + auto out = _mm_add_epi32(_mm_add_epi32(r, g), b); + + // Shift the result by r_shift, as the TRM says + out = _mm_sra_epi32(out, trm_shift); + + // Add the final column. Because the 4x1 matrix has this row as 1, there's no need to + // multiply by it, and as per the TRM this column ignores r_shift, so it's just added + // here after shifting. + out = _mm_add_epi32(out, col3); + + // Shift the result back from S12.8 to integer values + return _mm_srai_epi32(out, 8); + }; + + for (u32 y = source_top; y < source_bottom; y++) { + const auto src{y * in_surface_width + source_left}; + const auto dst{y * out_surface_width + rect_left}; + for (u32 x = source_left; x < source_right; x += 8) { + // clang-format off + // Prefetch the next iteration's memory + _mm_prefetch((const char*)&slot_surface[src + x + 8], _MM_HINT_T0); + + // Load in pixels + // p01 = [AA AA] [BB BB] [GG GG] [RR RR] [AA AA] [BB BB] [GG GG] [RR RR] + auto p01 = _mm_load_si128((__m128i*)&slot_surface[src + x + 0]); + auto p23 = _mm_load_si128((__m128i*)&slot_surface[src + x + 2]); + auto p45 = _mm_load_si128((__m128i*)&slot_surface[src + x + 4]); + auto p67 = _mm_load_si128((__m128i*)&slot_surface[src + x + 6]); + + // Convert the 16-bit channels into 32-bit (unsigned), as the matrix values are + // 32-bit and to avoid overflow. + // p01 = [AA2 AA2] [BB2 BB2] [GG2 GG2] [RR2 RR2] [AA1 AA1] [BB1 BB1] [GG1 GG1] [RR1 RR1] + // -> + // p01_lo = [001 001 AA1 AA1] [001 001 BB1 BB1] [001 001 GG1 GG1] [001 001 RR1 RR1] + // p01_hi = [002 002 AA2 AA2] [002 002 BB2 BB2] [002 002 GG2 GG2] [002 002 RR2 RR2] + auto p01_lo = _mm_cvtepu16_epi32(p01); + auto p01_hi = _mm_cvtepu16_epi32(_mm_srli_si128(p01, 8)); + auto p23_lo = _mm_cvtepu16_epi32(p23); + auto p23_hi = _mm_cvtepu16_epi32(_mm_srli_si128(p23, 8)); + auto p45_lo = _mm_cvtepu16_epi32(p45); + auto p45_hi = _mm_cvtepu16_epi32(_mm_srli_si128(p45, 8)); + auto p67_lo = _mm_cvtepu16_epi32(p67); + auto p67_hi = _mm_cvtepu16_epi32(_mm_srli_si128(p67, 8)); + + // Matrix multiply the pixel, doing the colour conversion. + auto out0 = MatMul(p01_lo, c0, c1, c2, c3, shift); + auto out1 = MatMul(p01_hi, c0, c1, c2, c3, shift); + auto out2 = MatMul(p23_lo, c0, c1, c2, c3, shift); + auto out3 = MatMul(p23_hi, c0, c1, c2, c3, shift); + auto out4 = MatMul(p45_lo, c0, c1, c2, c3, shift); + auto out5 = MatMul(p45_hi, c0, c1, c2, c3, shift); + auto out6 = MatMul(p67_lo, c0, c1, c2, c3, shift); + auto out7 = MatMul(p67_hi, c0, c1, c2, c3, shift); + + // Pack the 32-bit channel pixels back into 16-bit using unsigned saturation + // out0 = [001 001 AA1 AA1] [001 001 BB1 BB1] [001 001 GG1 GG1] [001 001 RR1 RR1] + // out1 = [002 002 AA2 AA2] [002 002 BB2 BB2] [002 002 GG2 GG2] [002 002 RR2 RR2] + // -> + // done0 = [AA2 AA2] [BB2 BB2] [GG2 GG2] [RR2 RR2] [AA1 AA1] [BB1 BB1] [GG1 GG1] [RR1 RR1] + auto done0 = _mm_packus_epi32(out0, out1); + auto done1 = _mm_packus_epi32(out2, out3); + auto done2 = _mm_packus_epi32(out4, out5); + auto done3 = _mm_packus_epi32(out6, out7); + + // Blend the original alpha back into the pixel, as the matrix multiply gives us a + // 3-channel output, not 4. + // 0x88 = b10001000, taking RGB from the first argument, A from the second argument. + // done0 = [002 002] [BB2 BB2] [GG2 GG2] [RR2 RR2] [001 001] [BB1 BB1] [GG1 GG1] [RR1 RR1] + // -> + // done0 = [AA2 AA2] [BB2 BB2] [GG2 GG2] [RR2 RR2] [AA1 AA1] [BB1 BB1] [GG1 GG1] [RR1 RR1] + done0 = _mm_blend_epi16(done0, p01, 0x88); + done1 = _mm_blend_epi16(done1, p23, 0x88); + done2 = _mm_blend_epi16(done2, p45, 0x88); + done3 = _mm_blend_epi16(done3, p67, 0x88); + + // Clamp the 16-bit channels to the soft-clamp min/max. + done0 = _mm_max_epu16(done0, clamp_min); + done1 = _mm_max_epu16(done1, clamp_min); + done2 = _mm_max_epu16(done2, clamp_min); + done3 = _mm_max_epu16(done3, clamp_min); + + done0 = _mm_min_epu16(done0, clamp_max); + done1 = _mm_min_epu16(done1, clamp_max); + done2 = _mm_min_epu16(done2, clamp_max); + done3 = _mm_min_epu16(done3, clamp_max); + + // Store the pixels to the output surface. + _mm_store_si128((__m128i*)&output_surface[dst + x + 0], done0); + _mm_store_si128((__m128i*)&output_surface[dst + x + 2], done1); + _mm_store_si128((__m128i*)&output_surface[dst + x + 4], done2); + _mm_store_si128((__m128i*)&output_surface[dst + x + 6], done3); + + } + } + // clang-format on +#else + DecodeLinear(); +#endif + } +} + +void Vic::WriteY8__V8U8_N420(const OutputSurfaceConfig& output_surface_config) { + constexpr u32 BytesPerPixel = 1; + + auto surface_width{output_surface_config.out_surface_width + 1}; + auto surface_height{output_surface_config.out_surface_height + 1}; + const auto surface_stride{surface_width}; + + const auto out_luma_width = output_surface_config.out_luma_width + 1; + const auto out_luma_height = output_surface_config.out_luma_height + 1; + const auto out_luma_stride = Common::AlignUp(out_luma_width * BytesPerPixel, 0x10); + const auto out_luma_size = out_luma_height * out_luma_stride; + + const auto out_chroma_width = output_surface_config.out_chroma_width + 1; + const auto out_chroma_height = output_surface_config.out_chroma_height + 1; + const auto out_chroma_stride = Common::AlignUp(out_chroma_width * BytesPerPixel * 2, 0x10); + const auto out_chroma_size = out_chroma_height * out_chroma_stride; + + surface_width = std::min(surface_width, out_luma_width); + surface_height = std::min(surface_height, out_luma_height); + + [[maybe_unused]] auto DecodeLinear = [&](std::span out_luma, std::span out_chroma) { + for (u32 y = 0; y < surface_height; ++y) { + const auto src_luma = y * surface_stride; + const auto dst_luma = y * out_luma_stride; + const auto src_chroma = y * surface_stride; + const auto dst_chroma = (y / 2) * out_chroma_stride; + for (u32 x = 0; x < surface_width; x += 2) { + out_luma[dst_luma + x + 0] = + static_cast(output_surface[src_luma + x + 0].r >> 2); + out_luma[dst_luma + x + 1] = + static_cast(output_surface[src_luma + x + 1].r >> 2); + out_chroma[dst_chroma + x + 0] = + static_cast(output_surface[src_chroma + x].g >> 2); + out_chroma[dst_chroma + x + 1] = + static_cast(output_surface[src_chroma + x].b >> 2); + } + } + }; + + auto Decode = [&](std::span out_luma, std::span out_chroma) { +#if defined(ARCHITECTURE_x86_64) + if (!has_sse41) { + DecodeLinear(out_luma, out_chroma); + return; + } +#endif + +#if defined(ARCHITECTURE_x86_64) || defined(ARCHITECTURE_arm64) + // luma_mask = [00 00] [00 00] [00 00] [FF FF] [00 00] [00 00] [00 00] [FF FF] + const auto luma_mask = _mm_set_epi16(0, 0, 0, -1, 0, 0, 0, -1); + + const auto sse_aligned_width = Common::AlignDown(surface_width, 16); + + for (u32 y = 0; y < surface_height; ++y) { + const auto src = y * surface_stride; + const auto dst_luma = y * out_luma_stride; + const auto dst_chroma = (y / 2) * out_chroma_stride; + u32 x = 0; + for (; x < sse_aligned_width; x += 16) { + // clang-format off + // Prefetch the next cache lines, 2 per iteration + _mm_prefetch((const char*)&output_surface[src + x + 16], _MM_HINT_T0); + _mm_prefetch((const char*)&output_surface[src + x + 24], _MM_HINT_T0); + + // Load the 64-bit pixels, 2 per variable. + auto pixel01 = _mm_load_si128((__m128i*)&output_surface[src + x + 0]); + auto pixel23 = _mm_load_si128((__m128i*)&output_surface[src + x + 2]); + auto pixel45 = _mm_load_si128((__m128i*)&output_surface[src + x + 4]); + auto pixel67 = _mm_load_si128((__m128i*)&output_surface[src + x + 6]); + auto pixel89 = _mm_load_si128((__m128i*)&output_surface[src + x + 8]); + auto pixel1011 = _mm_load_si128((__m128i*)&output_surface[src + x + 10]); + auto pixel1213 = _mm_load_si128((__m128i*)&output_surface[src + x + 12]); + auto pixel1415 = _mm_load_si128((__m128i*)&output_surface[src + x + 14]); + + // Split out the luma of each pixel using the luma_mask above. + // pixel01 = [AA2 AA2] [VV2 VV2] [UU2 UU2] [LL2 LL2] [AA1 AA1] [VV1 VV1] [UU1 UU1] [LL1 LL1] + // -> + // l01 = [002 002] [002 002] [002 002] [LL2 LL2] [001 001] [001 001] [001 001] [LL1 LL1] + auto l01 = _mm_and_si128(pixel01, luma_mask); + auto l23 = _mm_and_si128(pixel23, luma_mask); + auto l45 = _mm_and_si128(pixel45, luma_mask); + auto l67 = _mm_and_si128(pixel67, luma_mask); + auto l89 = _mm_and_si128(pixel89, luma_mask); + auto l1011 = _mm_and_si128(pixel1011, luma_mask); + auto l1213 = _mm_and_si128(pixel1213, luma_mask); + auto l1415 = _mm_and_si128(pixel1415, luma_mask); + + // Pack 32-bit elements from 2 registers down into 16-bit elements in 1 register. + // l01 = [002 002 002 002] [002 002 LL2 LL2] [001 001 001 001] [001 001 LL1 LL1] + // l23 = [004 004 004 004] [004 004 LL4 LL4] [003 003 003 003] [003 003 LL3 LL3] + // -> + // l0123 = [004 004] [LL4 LL4] [003 003] [LL3 LL3] [002 002] [LL2 LL2] [001 001] [LL1 LL1] + auto l0123 = _mm_packus_epi32(l01, l23); + auto l4567 = _mm_packus_epi32(l45, l67); + auto l891011 = _mm_packus_epi32(l89, l1011); + auto l12131415 = _mm_packus_epi32(l1213, l1415); + + // Pack 32-bit elements from 2 registers down into 16-bit elements in 1 register. + // l0123 = [004 004 LL4 LL4] [003 003 LL3 LL3] [002 002 LL2 LL2] [001 001 LL1 LL1] + // l4567 = [008 008 LL8 LL8] [007 007 LL7 LL7] [006 006 LL6 LL6] [005 005 LL5 LL5] + // -> + // luma_lo = [LL8 LL8] [LL7 LL7] [LL6 LL6] [LL5 LL5] [LL4 LL4] [LL3 LL3] [LL2 LL2] [LL1 LL1] + auto luma_lo = _mm_packus_epi32(l0123, l4567); + auto luma_hi = _mm_packus_epi32(l891011, l12131415); + + // Right-shift the 16-bit elements by 2, un-doing the left shift by 2 on read + // and bringing the range back to 8-bit. + luma_lo = _mm_srli_epi16(luma_lo, 2); + luma_hi = _mm_srli_epi16(luma_hi, 2); + + // Pack with unsigned saturation the 16-bit values in 2 registers into 8-bit values in 1 register. + // luma_lo = [LL8 LL8] [LL7 LL7] [LL6 LL6] [LL5 LL5] [LL4 LL4] [LL3 LL3] [LL2 LL2] [LL1 LL1] + // luma_hi = [LL16 LL16] [LL15 LL15] [LL14 LL14] [LL13 LL13] [LL12 LL12] [LL11 LL11] [LL10 LL10] [LL9 LL9] + // -> + // luma = [LL16] [LL15] [LL14] [LL13] [LL12] [LL11] [LL10] [LL9] [LL8] [LL7] [LL6] [LL5] [LL4] [LL3] [LL2] [LL1] + auto luma = _mm_packus_epi16(luma_lo, luma_hi); + + // Store the 16 bytes of luma + _mm_store_si128((__m128i*)&out_luma[dst_luma + x], luma); + + if (y % 2 == 0) { + // Chroma, done every other line as it's half the height of luma. + + // Shift the register right by 2 bytes (not bits), to kick out the 16-bit luma. + // We can do this instead of &'ing a mask and then shifting. + // pixel01 = [AA2 AA2] [VV2 VV2] [UU2 UU2] [LL2 LL2] [AA1 AA1] [VV1 VV1] [UU1 UU1] [LL1 LL1] + // -> + // c01 = [ 00 00] [AA2 AA2] [VV2 VV2] [UU2 UU2] [LL2 LL2] [AA1 AA1] [VV1 VV1] [UU1 UU1] + auto c01 = _mm_srli_si128(pixel01, 2); + auto c23 = _mm_srli_si128(pixel23, 2); + auto c45 = _mm_srli_si128(pixel45, 2); + auto c67 = _mm_srli_si128(pixel67, 2); + auto c89 = _mm_srli_si128(pixel89, 2); + auto c1011 = _mm_srli_si128(pixel1011, 2); + auto c1213 = _mm_srli_si128(pixel1213, 2); + auto c1415 = _mm_srli_si128(pixel1415, 2); + + // Interleave the lower 8 bytes as 32-bit elements from 2 registers into 1 register. + // This has the effect of skipping every other chroma value horitonally, + // notice the high pixels UU2/UU4 are skipped. + // This is intended as N420 chroma width is half the luma width. + // c01 = [ 00 00 AA2 AA2] [VV2 VV2 UU2 UU2] [LL2 LL2 AA1 AA1] [VV1 VV1 UU1 UU1] + // c23 = [ 00 00 AA4 AA4] [VV4 VV4 UU4 UU4] [LL4 LL4 AA3 AA3] [VV3 VV3 UU3 UU3] + // -> + // c0123 = [LL4 LL4 AA3 AA3] [LL2 LL2 AA1 AA1] [VV3 VV3 UU3 UU3] [VV1 VV1 UU1 UU1] + auto c0123 = _mm_unpacklo_epi32(c01, c23); + auto c4567 = _mm_unpacklo_epi32(c45, c67); + auto c891011 = _mm_unpacklo_epi32(c89, c1011); + auto c12131415 = _mm_unpacklo_epi32(c1213, c1415); + + // Interleave the low 64-bit elements from 2 registers into 1. + // c0123 = [LL4 LL4 AA3 AA3 LL2 LL2 AA1 AA1] [VV3 VV3 UU3 UU3 VV1 VV1 UU1 UU1] + // c4567 = [LL8 LL8 AA7 AA7 LL6 LL6 AA5 AA5] [VV7 VV7 UU7 UU7 VV5 VV5 UU5 UU5] + // -> + // chroma_lo = [VV7 VV7 UU7 UU7 VV5 VV5 UU5 UU5] [VV3 VV3 UU3 UU3 VV1 VV1 UU1 UU1] + auto chroma_lo = _mm_unpacklo_epi64(c0123, c4567); + auto chroma_hi = _mm_unpacklo_epi64(c891011, c12131415); + + // Right-shift the 16-bit elements by 2, un-doing the left shift by 2 on read + // and bringing the range back to 8-bit. + chroma_lo = _mm_srli_epi16(chroma_lo, 2); + chroma_hi = _mm_srli_epi16(chroma_hi, 2); + + // Pack with unsigned saturation the 16-bit elements from 2 registers into 8-bit elements in 1 register. + // chroma_lo = [ VV7 VV7] [ UU7 UU7] [ VV5 VV5] [ UU5 UU5] [ VV3 VV3] [ UU3 UU3] [VV1 VV1] [UU1 UU1] + // chroma_hi = [VV15 VV15] [UU15 UU15] [VV13 VV13] [UU13 UU13] [VV11 VV11] [UU11 UU11] [VV9 VV9] [UU9 UU9] + // -> + // chroma = [VV15] [UU15] [VV13] [UU13] [VV11] [UU11] [VV9] [UU9] [VV7] [UU7] [VV5] [UU5] [VV3] [UU3] [VV1] [UU1] + auto chroma = _mm_packus_epi16(chroma_lo, chroma_hi); + + // Store the 16 bytes of chroma. + _mm_store_si128((__m128i*)&out_chroma[dst_chroma + x + 0], chroma); + } + + // clang-format on + } + + const auto src_chroma = y * surface_stride; + for (; x < surface_width; x += 2) { + out_luma[dst_luma + x + 0] = static_cast(output_surface[src + x + 0].r >> 2); + out_luma[dst_luma + x + 1] = static_cast(output_surface[src + x + 1].r >> 2); + out_chroma[dst_chroma + x + 0] = + static_cast(output_surface[src_chroma + x].g >> 2); + out_chroma[dst_chroma + x + 1] = + static_cast(output_surface[src_chroma + x].b >> 2); + } + } +#else + DecodeLinear(out_luma, out_chroma); +#endif + }; + + switch (output_surface_config.out_block_kind) { + case BLK_KIND::GENERIC_16Bx2: { + const u32 block_height = static_cast(output_surface_config.out_block_height); + const auto out_luma_swizzle_size = Texture::CalculateSize( + true, BytesPerPixel, out_luma_width, out_luma_height, 1, block_height, 0); + const auto out_chroma_swizzle_size = Texture::CalculateSize( + true, BytesPerPixel * 2, out_chroma_width, out_chroma_height, 1, block_height, 0); + + LOG_TRACE( + HW_GPU, + "Writing Y8__V8U8_N420 swizzled frame\n" + "\tinput surface {}x{} stride {} size 0x{:X}\n" + "\toutput luma {}x{} stride {} size 0x{:X} block height {} swizzled size 0x{:X}\n", + "\toutput chroma {}x{} stride {} size 0x{:X} block height {} swizzled size 0x{:X}", + surface_width, surface_height, surface_stride * BytesPerPixel, + surface_stride * surface_height * BytesPerPixel, out_luma_width, out_luma_height, + out_luma_stride, out_luma_size, block_height, out_luma_swizzle_size, out_chroma_width, + out_chroma_height, out_chroma_stride, out_chroma_size, block_height, + out_chroma_swizzle_size); + + luma_scratch.resize_destructive(out_luma_size); + chroma_scratch.resize_destructive(out_chroma_size); + + Decode(luma_scratch, chroma_scratch); + + Tegra::Memory::GpuGuestMemoryScoped out_luma( + memory_manager, regs.output_surface.luma.Address(), out_luma_swizzle_size, + &swizzle_scratch); + + if (block_height == 1) { + SwizzleSurface(out_luma, out_luma_stride, luma_scratch, out_luma_stride, + out_luma_height); + } else { + Texture::SwizzleTexture(out_luma, luma_scratch, BytesPerPixel, out_luma_width, + out_luma_height, 1, block_height, 0, 1); + } + + Tegra::Memory::GpuGuestMemoryScoped + out_chroma(memory_manager, regs.output_surface.chroma_u.Address(), + out_chroma_swizzle_size, &swizzle_scratch); + + if (block_height == 1) { + SwizzleSurface(out_chroma, out_chroma_stride, chroma_scratch, out_chroma_stride, + out_chroma_height); + } else { + Texture::SwizzleTexture(out_chroma, chroma_scratch, BytesPerPixel, out_chroma_width, + out_chroma_height, 1, block_height, 0, 1); + } + } break; + case BLK_KIND::PITCH: { + LOG_TRACE( + HW_GPU, + "Writing Y8__V8U8_N420 swizzled frame\n" + "\tinput surface {}x{} stride {} size 0x{:X}\n" + "\toutput luma {}x{} stride {} size 0x{:X} block height {} swizzled size 0x{:X}\n", + "\toutput chroma {}x{} stride {} size 0x{:X} block height {} swizzled size 0x{:X}", + surface_width, surface_height, surface_stride * BytesPerPixel, + surface_stride * surface_height * BytesPerPixel, out_luma_width, out_luma_height, + out_luma_stride, out_luma_size, out_chroma_width, out_chroma_height, out_chroma_stride, + out_chroma_size); + + // Unfortunately due to a driver bug or game bug, the chroma address can be not + // appropriately spaced from the luma, so the luma of size out_stride * height runs into the + // top of the chroma buffer. Unfortunately that removes an optimisation here where we could + // create guest spans and decode into game memory directly to avoid the memory copy from + // scratch to game. Due to this bug, we must write the luma first, and then the chroma + // afterwards to re-overwrite the luma being too large. + luma_scratch.resize_destructive(out_luma_size); + chroma_scratch.resize_destructive(out_chroma_size); + + Decode(luma_scratch, chroma_scratch); + + memory_manager.WriteBlock(regs.output_surface.luma.Address(), luma_scratch.data(), + out_luma_size); + memory_manager.WriteBlock(regs.output_surface.chroma_u.Address(), chroma_scratch.data(), + out_chroma_size); + } break; + default: + UNREACHABLE(); + break; + } +} + +template +void Vic::WriteABGR(const OutputSurfaceConfig& output_surface_config) { + constexpr u32 BytesPerPixel = 4; + + auto surface_width{output_surface_config.out_surface_width + 1}; + auto surface_height{output_surface_config.out_surface_height + 1}; + const auto surface_stride{surface_width}; + + const auto out_luma_width = output_surface_config.out_luma_width + 1; + const auto out_luma_height = output_surface_config.out_luma_height + 1; + const auto out_luma_stride = Common ::AlignUp(out_luma_width * BytesPerPixel, 0x10); + const auto out_luma_size = out_luma_height * out_luma_stride; + + surface_width = std::min(surface_width, out_luma_width); + surface_height = std::min(surface_height, out_luma_height); + + [[maybe_unused]] auto DecodeLinear = [&](std::span out_buffer) { + for (u32 y = 0; y < surface_height; y++) { + const auto src = y * surface_stride; + const auto dst = y * out_luma_stride; + for (u32 x = 0; x < surface_width; x++) { + if constexpr (Format == VideoPixelFormat::A8R8G8B8) { + out_buffer[dst + x * 4 + 0] = static_cast(output_surface[src + x].b >> 2); + out_buffer[dst + x * 4 + 1] = static_cast(output_surface[src + x].g >> 2); + out_buffer[dst + x * 4 + 2] = static_cast(output_surface[src + x].r >> 2); + out_buffer[dst + x * 4 + 3] = static_cast(output_surface[src + x].a >> 2); + } else { + out_buffer[dst + x * 4 + 0] = static_cast(output_surface[src + x].r >> 2); + out_buffer[dst + x * 4 + 1] = static_cast(output_surface[src + x].g >> 2); + out_buffer[dst + x * 4 + 2] = static_cast(output_surface[src + x].b >> 2); + out_buffer[dst + x * 4 + 3] = static_cast(output_surface[src + x].a >> 2); + } + } + } + }; + + auto Decode = [&](std::span out_buffer) { +#if defined(ARCHITECTURE_x86_64) + if (!has_sse41) { + DecodeLinear(out_buffer); + return; + } +#endif + +#if defined(ARCHITECTURE_x86_64) || defined(ARCHITECTURE_arm64) + constexpr size_t SseAlignment = 16; + const auto sse_aligned_width = Common::AlignDown(surface_width, SseAlignment); + + for (u32 y = 0; y < surface_height; y++) { + const auto src = y * surface_stride; + const auto dst = y * out_luma_stride; + u32 x = 0; + for (; x < sse_aligned_width; x += SseAlignment) { + // clang-format off + // Prefetch the next 2 cache lines + _mm_prefetch((const char*)&output_surface[src + x + 16], _MM_HINT_T0); + _mm_prefetch((const char*)&output_surface[src + x + 24], _MM_HINT_T0); + + // Load the pixels, 16-bit channels, 8 bytes per pixel, e.g + // pixel01 = [AA AA BB BB GG GG RR RR AA AA BB BB GG GG RR RR + auto pixel01 = _mm_load_si128((__m128i*)&output_surface[src + x + 0]); + auto pixel23 = _mm_load_si128((__m128i*)&output_surface[src + x + 2]); + auto pixel45 = _mm_load_si128((__m128i*)&output_surface[src + x + 4]); + auto pixel67 = _mm_load_si128((__m128i*)&output_surface[src + x + 6]); + auto pixel89 = _mm_load_si128((__m128i*)&output_surface[src + x + 8]); + auto pixel1011 = _mm_load_si128((__m128i*)&output_surface[src + x + 10]); + auto pixel1213 = _mm_load_si128((__m128i*)&output_surface[src + x + 12]); + auto pixel1415 = _mm_load_si128((__m128i*)&output_surface[src + x + 14]); + + // Right-shift the channels by 16 to un-do the left shit on read and bring the range + // back to 8-bit. + pixel01 = _mm_srli_epi16(pixel01, 2); + pixel23 = _mm_srli_epi16(pixel23, 2); + pixel45 = _mm_srli_epi16(pixel45, 2); + pixel67 = _mm_srli_epi16(pixel67, 2); + pixel89 = _mm_srli_epi16(pixel89, 2); + pixel1011 = _mm_srli_epi16(pixel1011, 2); + pixel1213 = _mm_srli_epi16(pixel1213, 2); + pixel1415 = _mm_srli_epi16(pixel1415, 2); + + // Pack with unsigned saturation 16-bit channels from 2 registers into 8-bit channels in 1 register. + // pixel01 = [AA2 AA2] [BB2 BB2] [GG2 GG2] [RR2 RR2] [AA1 AA1] [BB1 BB1] [GG1 GG1] [RR1 RR1] + // pixel23 = [AA4 AA4] [BB4 BB4] [GG4 GG4] [RR4 RR4] [AA3 AA3] [BB3 BB3] [GG3 GG3] [RR3 RR3] + // -> + // pixels0_lo = [AA4] [BB4] [GG4] [RR4] [AA3] [BB3] [GG3] [RR3] [AA2] [BB2] [GG2] [RR2] [AA1] [BB1] [GG1] [RR1] + auto pixels0_lo = _mm_packus_epi16(pixel01, pixel23); + auto pixels0_hi = _mm_packus_epi16(pixel45, pixel67); + auto pixels1_lo = _mm_packus_epi16(pixel89, pixel1011); + auto pixels1_hi = _mm_packus_epi16(pixel1213, pixel1415); + + if constexpr (Format == VideoPixelFormat::A8R8G8B8) { + const auto shuffle = + _mm_set_epi8(15, 12, 13, 14, 11, 8, 9, 10, 7, 4, 5, 6, 3, 0, 1, 2); + + // Our pixels are ABGR (big-endian) by default, if ARGB is needed, we need to shuffle. + // pixels0_lo = [AA4 BB4 GG4 RR4] [AA3 BB3 GG3 RR3] [AA2 BB2 GG2 RR2] [AA1 BB1 GG1 RR1] + // -> + // pixels0_lo = [AA4 RR4 GG4 BB4] [AA3 RR3 GG3 BB3] [AA2 RR2 GG2 BB2] [AA1 RR1 GG1 BB1] + pixels0_lo = _mm_shuffle_epi8(pixels0_lo, shuffle); + pixels0_hi = _mm_shuffle_epi8(pixels0_hi, shuffle); + pixels1_lo = _mm_shuffle_epi8(pixels1_lo, shuffle); + pixels1_hi = _mm_shuffle_epi8(pixels1_hi, shuffle); + } + + // Store the pixels + _mm_store_si128((__m128i*)&out_buffer[dst + x * 4 + 0], pixels0_lo); + _mm_store_si128((__m128i*)&out_buffer[dst + x * 4 + 16], pixels0_hi); + _mm_store_si128((__m128i*)&out_buffer[dst + x * 4 + 32], pixels1_lo); + _mm_store_si128((__m128i*)&out_buffer[dst + x * 4 + 48], pixels1_hi); + + // clang-format on + } + + for (; x < surface_width; x++) { + if constexpr (Format == VideoPixelFormat::A8R8G8B8) { + out_buffer[dst + x * 4 + 0] = static_cast(output_surface[src + x].b >> 2); + out_buffer[dst + x * 4 + 1] = static_cast(output_surface[src + x].g >> 2); + out_buffer[dst + x * 4 + 2] = static_cast(output_surface[src + x].r >> 2); + out_buffer[dst + x * 4 + 3] = static_cast(output_surface[src + x].a >> 2); + } else { + out_buffer[dst + x * 4 + 0] = static_cast(output_surface[src + x].r >> 2); + out_buffer[dst + x * 4 + 1] = static_cast(output_surface[src + x].g >> 2); + out_buffer[dst + x * 4 + 2] = static_cast(output_surface[src + x].b >> 2); + out_buffer[dst + x * 4 + 3] = static_cast(output_surface[src + x].a >> 2); + } + } + } +#else + DecodeLinear(out_buffer); +#endif + }; + + switch (output_surface_config.out_block_kind) { + case BLK_KIND::GENERIC_16Bx2: { + const u32 block_height = static_cast(output_surface_config.out_block_height); + const auto out_swizzle_size = Texture::CalculateSize(true, BytesPerPixel, out_luma_width, + out_luma_height, 1, block_height, 0); + + LOG_TRACE( + HW_GPU, + "Writing ABGR swizzled frame\n" + "\tinput surface {}x{} stride {} size 0x{:X}\n" + "\toutput surface {}x{} stride {} size 0x{:X} block height {} swizzled size 0x{:X}", + surface_width, surface_height, surface_stride * BytesPerPixel, + surface_stride * surface_height * BytesPerPixel, out_luma_width, out_luma_height, + out_luma_stride, out_luma_size, block_height, out_swizzle_size); + + luma_scratch.resize_destructive(out_luma_size); + + Decode(luma_scratch); + + Tegra::Memory::GpuGuestMemoryScoped out_luma( + memory_manager, regs.output_surface.luma.Address(), out_swizzle_size, &swizzle_scratch); + + if (block_height == 1) { + SwizzleSurface(out_luma, out_luma_stride, luma_scratch, out_luma_stride, + out_luma_height); + } else { + Texture::SwizzleTexture(out_luma, luma_scratch, BytesPerPixel, out_luma_width, + out_luma_height, 1, block_height, 0, 1); + } + + } break; + case BLK_KIND::PITCH: { + LOG_TRACE(HW_GPU, + "Writing ABGR pitch frame\n" + "\tinput surface {}x{} stride {} size 0x{:X}" + "\toutput surface {}x{} stride {} size 0x{:X}", + surface_width, surface_height, surface_stride, + surface_stride * surface_height * BytesPerPixel, out_luma_width, out_luma_height, + out_luma_stride, out_luma_size); + + luma_scratch.resize_destructive(out_luma_size); + + Tegra::Memory::GpuGuestMemoryScoped out_luma( + memory_manager, regs.output_surface.luma.Address(), out_luma_size, &luma_scratch); + + Decode(out_luma); + } break; + default: + UNREACHABLE(); + break; + } +} + +} // namespace Tegra::Host1x diff --git a/src/video_core/host1x/vic.h b/src/video_core/host1x/vic.h index 6c868f062..e7600941a 100644 --- a/src/video_core/host1x/vic.h +++ b/src/video_core/host1x/vic.h @@ -3,65 +3,646 @@ #pragma once +#include +#include #include +#include +#include #include "common/common_types.h" #include "common/scratch_buffer.h" +#include "video_core/cdma_pusher.h" -struct SwsContext; - -namespace Tegra { - -namespace Host1x { - +namespace Tegra::Host1x { class Host1x; class Nvdec; -union VicConfig; -class Vic { +struct Pixel { + u16 r; + u16 g; + u16 b; + u16 a; +}; + +// One underscore represents separate pixels. +// Double underscore represents separate planes. +// _N represents chroma subsampling, not a separate pixel. +enum class VideoPixelFormat : u32 { + A8 = 0, + L8 = 1, + A4L4 = 2, + L4A4 = 3, + R8 = 4, + A8L8 = 5, + L8A8 = 6, + R8G8 = 7, + G8R8 = 8, + B5G6R5 = 9, + R5G6B5 = 10, + B6G5R5 = 11, + R5G5B6 = 12, + A1B5G5R5 = 13, + A1R5G5B5 = 14, + B5G5R5A1 = 15, + R5G5B5A1 = 16, + A5B5G5R1 = 17, + A5R1G5B5 = 18, + B5G5R1A5 = 19, + R1G5B5A5 = 20, + X1B5G5R5 = 21, + X1R5G5B5 = 22, + B5G5R5X1 = 23, + R5G5B5X1 = 24, + A4B4G5R4 = 25, + A4R4G4B4 = 26, + B4G4R4A4 = 27, + R4G4B4A4 = 28, + B8G8R8 = 29, + R8G8B8 = 30, + A8B8G8R8 = 31, + A8R8G8B8 = 32, + B8G8R8A8 = 33, + R8G8B8A8 = 34, + X8B8G8R8 = 35, + X8R8G8B8 = 36, + B8G8R8X8 = 37, + R8G8B8X8 = 38, + A8B10G10R10 = 39, + A2R10G10B10 = 40, + B10G10R10A2 = 41, + R10G10B10A2 = 42, + A4P4 = 43, + P4A4 = 44, + P8A8 = 45, + A8P8 = 46, + P8 = 47, + P1 = 48, + U8V8 = 49, + V8U8 = 50, + A8Y8U8V8 = 51, + V8U8Y8A8 = 52, + Y8U8V8 = 53, + Y8V8U8 = 54, + U8V8Y8 = 55, + V8U8Y8 = 56, + Y8U8_Y8V8 = 57, + Y8V8_Y8U8 = 58, + U8Y8_V8Y8 = 59, + V8Y8_U8Y8 = 60, + Y8__U8V8_N444 = 61, + Y8__V8U8_N444 = 62, + Y8__U8V8_N422 = 63, + Y8__V8U8_N422 = 64, + Y8__U8V8_N422R = 65, + Y8__V8U8_N422R = 66, + Y8__U8V8_N420 = 67, + Y8__V8U8_N420 = 68, + Y8__U8__V8_N444 = 69, + Y8__U8__V8_N422 = 70, + Y8__U8__V8_N422R = 71, + Y8__U8__V8_N420 = 72, + U8 = 73, + V8 = 74, +}; + +struct Offset { + constexpr u32 Address() const noexcept { + return offset << 8; + } + +private: + u32 offset; +}; +static_assert(std::is_trivial_v, "Offset must be trivial"); +static_assert(sizeof(Offset) == 0x4, "Offset has the wrong size!"); + +struct PlaneOffsets { + Offset luma; + Offset chroma_u; + Offset chroma_v; +}; +static_assert(sizeof(PlaneOffsets) == 0xC, "PlaneOffsets has the wrong size!"); + +enum SurfaceIndex : u32 { + Current = 0, + Previous = 1, + Next = 2, + NextNoiseReduced = 3, + CurrentMotion = 4, + PreviousMotion = 5, + PreviousPreviousMotion = 6, + CombinedMotion = 7, +}; + +enum class DXVAHD_ALPHA_FILL_MODE : u32 { + OPAQUE = 0, + BACKGROUND = 1, + DESTINATION = 2, + SOURCE_STREAM = 3, + COMPOSITED = 4, + SOURCE_ALPHA = 5, +}; + +enum class DXVAHD_FRAME_FORMAT : u64 { + PROGRESSIVE = 0, + INTERLACED_TOP_FIELD_FIRST = 1, + INTERLACED_BOTTOM_FIELD_FIRST = 2, + TOP_FIELD = 3, + BOTTOM_FIELD = 4, + SUBPIC_PROGRESSIVE = 5, + SUBPIC_INTERLACED_TOP_FIELD_FIRST = 6, + SUBPIC_INTERLACED_BOTTOM_FIELD_FIRST = 7, + SUBPIC_TOP_FIELD = 8, + SUBPIC_BOTTOM_FIELD = 9, + TOP_FIELD_CHROMA_BOTTOM = 10, + BOTTOM_FIELD_CHROMA_TOP = 11, + SUBPIC_TOP_FIELD_CHROMA_BOTTOM = 12, + SUBPIC_BOTTOM_FIELD_CHROMA_TOP = 13, +}; + +enum class DXVAHD_DEINTERLACE_MODE_PRIVATE : u64 { + WEAVE = 0, + BOB_FIELD = 1, + BOB = 2, + NEWBOB = 3, + DISI1 = 4, + WEAVE_LUMA_BOB_FIELD_CHROMA = 5, + MAX = 0xF, +}; + +enum class BLK_KIND { + PITCH = 0, + GENERIC_16Bx2 = 1, + // These are unsupported in the vic + BL_NAIVE = 2, + BL_KEPLER_XBAR_RAW = 3, + VP2_TILED = 15, +}; + +enum class BLEND_SRCFACTC : u32 { + K1 = 0, + K1_TIMES_DST = 1, + NEG_K1_TIMES_DST = 2, + K1_TIMES_SRC = 3, + ZERO = 4, +}; + +enum class BLEND_DSTFACTC : u32 { + K1 = 0, + K2 = 1, + K1_TIMES_DST = 2, + NEG_K1_TIMES_DST = 3, + NEG_K1_TIMES_SRC = 4, + ZERO = 5, + ONE = 6, +}; + +enum class BLEND_SRCFACTA : u32 { + K1 = 0, + K2 = 1, + NEG_K1_TIMES_DST = 2, + ZERO = 3, + MAX = 7, +}; + +enum class BLEND_DSTFACTA : u32 { + K2 = 0, + NEG_K1_TIMES_SRC = 1, + ZERO = 2, + ONE = 3, + MAX = 7, +}; + +struct PipeConfig { + union { + BitField<0, 11, u32> downsample_horiz; + BitField<11, 5, u32> reserved0; + BitField<16, 11, u32> downsample_vert; + BitField<27, 5, u32> reserved1; + }; + u32 reserved2; + u32 reserved3; + u32 reserved4; +}; +static_assert(sizeof(PipeConfig) == 0x10, "PipeConfig has the wrong size!"); + +struct OutputConfig { + union { + BitField<0, 3, DXVAHD_ALPHA_FILL_MODE> alpha_fill_mode; + BitField<3, 3, u64> alpha_fill_slot; + BitField<6, 10, u64> background_a; + BitField<16, 10, u64> background_r; + BitField<26, 10, u64> background_g; + BitField<36, 10, u64> background_b; + BitField<46, 2, u64> regamma_mode; + BitField<48, 1, u64> output_flip_x; + BitField<49, 1, u64> output_flip_y; + BitField<50, 1, u64> output_transpose; + BitField<51, 1, u64> reserved1; + BitField<52, 12, u64> reserved2; + }; + union { + BitField<0, 14, u32> target_rect_left; + BitField<14, 2, u32> reserved3; + BitField<16, 14, u32> target_rect_right; + BitField<30, 2, u32> reserved4; + }; + union { + BitField<0, 14, u32> target_rect_top; + BitField<14, 2, u32> reserved5; + BitField<16, 14, u32> target_rect_bottom; + BitField<30, 2, u32> reserved6; + }; +}; +static_assert(sizeof(OutputConfig) == 0x10, "OutputConfig has the wrong size!"); + +struct OutputSurfaceConfig { + union { + BitField<0, 7, VideoPixelFormat> out_pixel_format; + BitField<7, 2, u32> out_chroma_loc_horiz; + BitField<9, 2, u32> out_chroma_loc_vert; + BitField<11, 4, BLK_KIND> out_block_kind; + BitField<15, 4, u32> out_block_height; // in gobs, log2 + BitField<19, 3, u32> reserved0; + BitField<22, 10, u32> reserved1; + }; + union { + BitField<0, 14, u32> out_surface_width; // - 1 + BitField<14, 14, u32> out_surface_height; // - 1 + BitField<28, 4, u32> reserved2; + }; + union { + BitField<0, 14, u32> out_luma_width; // - 1 + BitField<14, 14, u32> out_luma_height; // - 1 + BitField<28, 4, u32> reserved3; + }; + union { + BitField<0, 14, u32> out_chroma_width; // - 1 + BitField<14, 14, u32> out_chroma_height; // - 1 + BitField<28, 4, u32> reserved4; + }; +}; +static_assert(sizeof(OutputSurfaceConfig) == 0x10, "OutputSurfaceConfig has the wrong size!"); + +struct MatrixStruct { + union { + BitField<0, 20, s64> matrix_coeff00; // (0,0) of 4x3 conversion matrix + BitField<20, 20, s64> matrix_coeff10; // (1,0) of 4x3 conversion matrix + BitField<40, 20, s64> matrix_coeff20; // (2,0) of 4x3 conversion matrix + BitField<60, 4, u64> matrix_r_shift; + }; + union { + BitField<0, 20, s64> matrix_coeff01; // (0,1) of 4x3 conversion matrix + BitField<20, 20, s64> matrix_coeff11; // (1,1) of 4x3 conversion matrix + BitField<40, 20, s64> matrix_coeff21; // (2,1) of 4x3 conversion matrix + BitField<60, 3, u64> reserved0; + BitField<63, 1, u64> matrix_enable; + }; + union { + BitField<0, 20, s64> matrix_coeff02; // (0,2) of 4x3 conversion matrix + BitField<20, 20, s64> matrix_coeff12; // (1,2) of 4x3 conversion matrix + BitField<40, 20, s64> matrix_coeff22; // (2,2) of 4x3 conversion matrix + BitField<60, 4, u64> reserved1; + }; + union { + BitField<0, 20, s64> matrix_coeff03; // (0,3) of 4x3 conversion matrix + BitField<20, 20, s64> matrix_coeff13; // (1,3) of 4x3 conversion matrix + BitField<40, 20, s64> matrix_coeff23; // (2,3) of 4x3 conversion matrix + BitField<60, 4, u64> reserved2; + }; +}; +static_assert(sizeof(MatrixStruct) == 0x20, "MatrixStruct has the wrong size!"); + +struct ClearRectStruct { + union { + BitField<0, 14, u32> clear_rect0_left; + BitField<14, 2, u32> reserved0; + BitField<16, 14, u32> clear_rect0_right; + BitField<30, 2, u32> reserved1; + }; + union { + BitField<0, 14, u32> clear_rect0_top; + BitField<14, 2, u32> reserved2; + BitField<16, 14, u32> clear_rect0_bottom; + BitField<30, 2, u32> reserved3; + }; + union { + BitField<0, 14, u32> clear_rect1_left; + BitField<14, 2, u32> reserved4; + BitField<16, 14, u32> clear_rect1_right; + BitField<30, 2, u32> reserved5; + }; + union { + BitField<0, 14, u32> clear_rect1_top; + BitField<14, 2, u32> reserved6; + BitField<16, 14, u32> clear_rect1_bottom; + BitField<30, 2, u32> reserved7; + }; +}; +static_assert(sizeof(ClearRectStruct) == 0x10, "ClearRectStruct has the wrong size!"); + +struct SlotConfig { + union { + BitField<0, 1, u64> slot_enable; + BitField<1, 1, u64> denoise; + BitField<2, 1, u64> advanced_denoise; + BitField<3, 1, u64> cadence_detect; + BitField<4, 1, u64> motion_map; + BitField<5, 1, u64> motion_map_capture; + BitField<6, 1, u64> is_even; + BitField<7, 1, u64> chroma_even; + // fetch control struct + BitField<8, 1, u64> current_field_enable; + BitField<9, 1, u64> prev_field_enable; + BitField<10, 1, u64> next_field_enable; + BitField<11, 1, u64> next_nr_field_enable; // noise reduction + BitField<12, 1, u64> current_motion_field_enable; + BitField<13, 1, u64> prev_motion_field_enable; + BitField<14, 1, u64> prev_prev_motion_field_enable; + BitField<15, 1, u64> combined_motion_field_enable; + + BitField<16, 4, DXVAHD_FRAME_FORMAT> frame_format; + BitField<20, 2, u64> filter_length_y; // 0: 1-tap, 1: 2-tap, 2: 5-tap, 3: 10-tap + BitField<22, 2, u64> filter_length_x; + BitField<24, 12, u64> panoramic; + BitField<36, 22, u64> reserved1; + BitField<58, 6, u64> detail_filter_clamp; + }; + union { + BitField<0, 10, u64> filter_noise; + BitField<10, 10, u64> filter_detail; + BitField<20, 10, u64> chroma_noise; + BitField<30, 10, u64> chroma_detail; + BitField<40, 4, DXVAHD_DEINTERLACE_MODE_PRIVATE> deinterlace_mode; + BitField<44, 3, u64> motion_accumulation_weight; + BitField<47, 11, u64> noise_iir; + BitField<58, 4, u64> light_level; + BitField<62, 2, u64> reserved4; + }; + union { + BitField<0, 10, u64> soft_clamp_low; + BitField<10, 10, u64> soft_clamp_high; + BitField<20, 3, u64> reserved5; + BitField<23, 9, u64> reserved6; + BitField<32, 10, u64> planar_alpha; + BitField<42, 1, u64> constant_alpha; + BitField<43, 3, u64> stereo_interleave; + BitField<46, 1, u64> clip_enabled; + BitField<47, 8, u64> clear_rect_mask; + BitField<55, 2, u64> degamma_mode; + BitField<57, 1, u64> reserved7; + BitField<58, 1, u64> decompress_enable; + BitField<59, 5, u64> reserved9; + }; + union { + BitField<0, 8, u64> decompress_ctb_count; + BitField<8, 32, u64> decompress_zbc_count; + BitField<40, 24, u64> reserved12; + }; + union { + BitField<0, 30, u64> source_rect_left; + BitField<30, 2, u64> reserved14; + BitField<32, 30, u64> source_rect_right; + BitField<62, 2, u64> reserved15; + }; + union { + BitField<0, 30, u64> source_rect_top; + BitField<30, 2, u64> reserved16; + BitField<32, 30, u64> source_rect_bottom; + BitField<62, 2, u64> reserved17; + }; + union { + BitField<0, 14, u64> dest_rect_left; + BitField<14, 2, u64> reserved18; + BitField<16, 14, u64> dest_rect_right; + BitField<30, 2, u64> reserved19; + BitField<32, 14, u64> dest_rect_top; + BitField<46, 2, u64> reserved20; + BitField<48, 14, u64> dest_rect_bottom; + BitField<62, 2, u64> reserved21; + }; + u32 reserved22; + u32 reserved23; +}; +static_assert(sizeof(SlotConfig) == 0x40, "SlotConfig has the wrong size!"); + +struct SlotSurfaceConfig { + union { + BitField<0, 7, VideoPixelFormat> slot_pixel_format; + BitField<7, 2, u32> slot_chroma_loc_horiz; + BitField<9, 2, u32> slot_chroma_loc_vert; + BitField<11, 4, u32> slot_block_kind; + BitField<15, 4, u32> slot_block_height; + BitField<19, 3, u32> slot_cache_width; + BitField<22, 10, u32> reserved0; + }; + union { + BitField<0, 14, u32> slot_surface_width; // - 1 + BitField<14, 14, u32> slot_surface_height; // - 1 + BitField<28, 4, u32> reserved1; + }; + union { + BitField<0, 14, u32> slot_luma_width; // padded, - 1 + BitField<14, 14, u32> slot_luma_height; // padded, - 1 + BitField<28, 4, u32> reserved2; + }; + union { + BitField<0, 14, u32> slot_chroma_width; // padded, - 1 + BitField<14, 14, u32> slot_chroma_height; // padded, - 1 + BitField<28, 4, u32> reserved3; + }; +}; +static_assert(sizeof(SlotSurfaceConfig) == 0x10, "SlotSurfaceConfig has the wrong size!"); + +struct LumaKeyStruct { + union { + BitField<0, 20, u64> luma_coeff0; // (0) of 4x1 conversion matrix, S12.8 format + BitField<20, 20, u64> luma_coeff1; // (1) of 4x1 conversion matrix, S12.8 format + BitField<40, 20, u64> luma_coeff2; // (2) of 4x1 conversion matrix, S12.8 format + BitField<60, 4, u64> luma_r_shift; + }; + union { + BitField<0, 20, u64> luma_coeff3; // (3) of 4x1 conversion matrix, S12.8 format + BitField<20, 10, u64> luma_key_lower; + BitField<30, 10, u64> luma_key_upper; + BitField<40, 1, u64> luma_key_enabled; + BitField<41, 2, u64> reserved0; + BitField<43, 21, u64> reserved1; + }; +}; +static_assert(sizeof(LumaKeyStruct) == 0x10, "LumaKeyStruct has the wrong size!"); + +struct BlendingSlotStruct { + union { + BitField<0, 10, u32> alpha_k1; + BitField<10, 6, u32> reserved0; + BitField<16, 10, u32> alpha_k2; + BitField<26, 6, u32> reserved1; + }; + union { + BitField<0, 3, BLEND_SRCFACTC> src_factor_color_match_select; + BitField<3, 1, u32> reserved2; + BitField<4, 3, BLEND_DSTFACTC> dst_factor_color_match_select; + BitField<7, 1, u32> reserved3; + BitField<8, 3, BLEND_SRCFACTA> src_factor_a_match_select; + BitField<11, 1, u32> reserved4; + BitField<12, 3, BLEND_DSTFACTA> dst_factor_a_match_select; + BitField<15, 1, u32> reserved5; + BitField<16, 4, u32> reserved6; + BitField<20, 4, u32> reserved7; + BitField<24, 4, u32> reserved8; + BitField<28, 4, u32> reserved9; + }; + union { + BitField<0, 2, u32> reserved10; + BitField<2, 10, u32> override_r; + BitField<12, 10, u32> override_g; + BitField<22, 10, u32> override_b; + }; + union { + BitField<0, 10, u32> override_a; + BitField<10, 2, u32> reserved11; + BitField<12, 1, u32> use_override_r; + BitField<13, 1, u32> use_override_g; + BitField<14, 1, u32> use_override_b; + BitField<15, 1, u32> use_override_a; + BitField<16, 1, u32> mask_r; + BitField<17, 1, u32> mask_g; + BitField<18, 1, u32> mask_b; + BitField<19, 1, u32> mask_a; + BitField<20, 12, u32> reserved12; + }; +}; +static_assert(sizeof(BlendingSlotStruct) == 0x10, "BlendingSlotStruct has the wrong size!"); + +struct SlotStruct { + SlotConfig config; + SlotSurfaceConfig surface_config; + LumaKeyStruct luma_key; + MatrixStruct color_matrix; + MatrixStruct gamut_matrix; + BlendingSlotStruct blending; +}; +static_assert(sizeof(SlotStruct) == 0xB0, "SlotStruct has the wrong size!"); + +struct ConfigStruct { + PipeConfig pipe_config; + OutputConfig output_config; + OutputSurfaceConfig output_surface_config; + MatrixStruct out_color_matrix; + std::array clear_rects; + std::array slot_structs; +}; +static_assert(offsetof(ConfigStruct, pipe_config) == 0x0, "pipe_config is in the wrong place!"); +static_assert(offsetof(ConfigStruct, output_config) == 0x10, + "output_config is in the wrong place!"); +static_assert(offsetof(ConfigStruct, output_surface_config) == 0x20, + "output_surface_config is in the wrong place!"); +static_assert(offsetof(ConfigStruct, out_color_matrix) == 0x30, + "out_color_matrix is in the wrong place!"); +static_assert(offsetof(ConfigStruct, clear_rects) == 0x50, "clear_rects is in the wrong place!"); +static_assert(offsetof(ConfigStruct, slot_structs) == 0x90, "slot_structs is in the wrong place!"); +static_assert(sizeof(ConfigStruct) == 0x610, "ConfigStruct has the wrong size!"); + +struct VicRegisters { + static constexpr std::size_t NUM_REGS = 0x446; + + union { + struct { + INSERT_PADDING_WORDS_NOINIT(0xC0); + u32 execute; + INSERT_PADDING_WORDS_NOINIT(0x3F); + std::array, 8> surfaces; + u32 picture_index; + u32 control_params; + Offset config_struct_offset; + Offset filter_struct_offset; + Offset palette_offset; + Offset hist_offset; + u32 context_id; + u32 fce_ucode_size; + PlaneOffsets output_surface; + Offset fce_ucode_offset; + INSERT_PADDING_WORDS_NOINIT(0x4); + std::array slot_context_ids; + std::array comp_tag_buffer_offsets; + std::array history_buffer_offset; + INSERT_PADDING_WORDS_NOINIT(0x25D); + u32 pm_trigger_end; + }; + std::array reg_array; + }; +}; +static_assert(offsetof(VicRegisters, execute) == 0x300, "execute is in the wrong place!"); +static_assert(offsetof(VicRegisters, surfaces) == 0x400, "surfaces is in the wrong place!"); +static_assert(offsetof(VicRegisters, picture_index) == 0x700, + "picture_index is in the wrong place!"); +static_assert(offsetof(VicRegisters, control_params) == 0x704, + "control_params is in the wrong place!"); +static_assert(offsetof(VicRegisters, config_struct_offset) == 0x708, + "config_struct_offset is in the wrong place!"); +static_assert(offsetof(VicRegisters, output_surface) == 0x720, + "output_surface is in the wrong place!"); +static_assert(offsetof(VicRegisters, slot_context_ids) == 0x740, + "slot_context_ids is in the wrong place!"); +static_assert(offsetof(VicRegisters, history_buffer_offset) == 0x780, + "history_buffer_offset is in the wrong place!"); +static_assert(offsetof(VicRegisters, pm_trigger_end) == 0x1114, + "pm_trigger_end is in the wrong place!"); +static_assert(sizeof(VicRegisters) == 0x1118, "VicRegisters has the wrong size!"); + +class Vic final : public CDmaPusher { public: enum class Method : u32 { - Execute = 0xc0, - SetControlParams = 0x1c1, - SetConfigStructOffset = 0x1c2, - SetOutputSurfaceLumaOffset = 0x1c8, - SetOutputSurfaceChromaOffset = 0x1c9, - SetOutputSurfaceChromaUnusedOffset = 0x1ca + Execute = offsetof(VicRegisters, execute), + SetControlParams = offsetof(VicRegisters, control_params), + SetConfigStructOffset = offsetof(VicRegisters, config_struct_offset), + SetOutputSurfaceLumaOffset = offsetof(VicRegisters, output_surface.luma), + SetOutputSurfaceChromaOffset = offsetof(VicRegisters, output_surface.chroma_u), + SetOutputSurfaceChromaUnusedOffset = offsetof(VicRegisters, output_surface.chroma_v) }; - explicit Vic(Host1x& host1x, std::shared_ptr nvdec_processor); - + explicit Vic(Host1x& host1x, s32 id, u32 syncpt, FrameQueue& frame_queue); ~Vic(); /// Write to the device state. - void ProcessMethod(Method method, u32 argument); + void ProcessMethod(u32 method, u32 arg) override; private: void Execute(); - void WriteRGBFrame(std::unique_ptr frame, const VicConfig& config); + void Blend(const ConfigStruct& config, const SlotStruct& slot); - void WriteYUVFrame(std::unique_ptr frame, const VicConfig& config); + template + void ReadProgressiveY8__V8U8_N420(const SlotStruct& slot, std::span offsets, + std::shared_ptr frame); + template + void ReadInterlacedY8__V8U8_N420(const SlotStruct& slot, std::span offsets, + std::shared_ptr frame); - Host1x& host1x; - std::shared_ptr nvdec_processor; + template + void ReadY8__V8U8_N420(const SlotStruct& slot, std::span offsets, + std::shared_ptr frame); - /// Avoid reallocation of the following buffers every frame, as their - /// size does not change during a stream - using AVMallocPtr = std::unique_ptr; - AVMallocPtr converted_frame_buffer; - Common::ScratchBuffer luma_buffer; - Common::ScratchBuffer chroma_buffer; + void WriteY8__V8U8_N420(const OutputSurfaceConfig& output_surface_config); - GPUVAddr config_struct_address{}; - GPUVAddr output_surface_luma_address{}; - GPUVAddr output_surface_chroma_address{}; + template + void WriteABGR(const OutputSurfaceConfig& output_surface_config); - SwsContext* scaler_ctx{}; - s32 scaler_width{}; - s32 scaler_height{}; + s32 id; + s32 nvdec_id{-1}; + u32 syncpoint; + + VicRegisters regs{}; + FrameQueue& frame_queue; + + const bool has_sse41{false}; + + Common::ScratchBuffer output_surface; + Common::ScratchBuffer slot_surface; + Common::ScratchBuffer luma_scratch; + Common::ScratchBuffer chroma_scratch; + Common::ScratchBuffer swizzle_scratch; }; -} // namespace Host1x - -} // namespace Tegra +} // namespace Tegra::Host1x diff --git a/src/video_core/memory_manager.h b/src/video_core/memory_manager.h index ac7c1472a..448624aa9 100644 --- a/src/video_core/memory_manager.h +++ b/src/video_core/memory_manager.h @@ -43,6 +43,8 @@ public: u64 big_page_bits_ = 16, u64 page_bits_ = 12); ~MemoryManager(); + static constexpr bool HAS_FLUSH_INVALIDATION = true; + size_t GetID() const { return unique_identifier; } diff --git a/src/video_core/renderer_opengl/renderer_opengl.cpp b/src/video_core/renderer_opengl/renderer_opengl.cpp index 5fb54635d..452af2787 100644 --- a/src/video_core/renderer_opengl/renderer_opengl.cpp +++ b/src/video_core/renderer_opengl/renderer_opengl.cpp @@ -71,7 +71,7 @@ const char* GetType(GLenum type) { void APIENTRY DebugHandler(GLenum source, GLenum type, GLuint id, GLenum severity, GLsizei length, const GLchar* message, const void* user_param) { - const char format[] = "{} {} {}: {}"; + constexpr std::string_view format = "{} {} {}: {}"; const char* const str_source = GetSource(source); const char* const str_type = GetType(type); diff --git a/src/video_core/vulkan_common/vulkan_device.cpp b/src/video_core/vulkan_common/vulkan_device.cpp index b94924a58..9e01155b1 100644 --- a/src/video_core/vulkan_common/vulkan_device.cpp +++ b/src/video_core/vulkan_common/vulkan_device.cpp @@ -40,6 +40,12 @@ constexpr std::array DEPTH24_UNORM_STENCIL8_UINT{ VK_FORMAT_UNDEFINED, }; +constexpr std::array DEPTH24_UNORM_DONTCARE8{ + VK_FORMAT_D32_SFLOAT, + VK_FORMAT_D16_UNORM, + VK_FORMAT_UNDEFINED, +}; + constexpr std::array DEPTH16_UNORM_STENCIL8_UINT{ VK_FORMAT_D24_UNORM_S8_UINT, VK_FORMAT_D32_SFLOAT_S8_UINT, @@ -95,6 +101,8 @@ constexpr const VkFormat* GetFormatAlternatives(VkFormat format) { return Alternatives::STENCIL8_UINT.data(); case VK_FORMAT_D24_UNORM_S8_UINT: return Alternatives::DEPTH24_UNORM_STENCIL8_UINT.data(); + case VK_FORMAT_X8_D24_UNORM_PACK32: + return Alternatives::DEPTH24_UNORM_DONTCARE8.data(); case VK_FORMAT_D16_UNORM_S8_UINT: return Alternatives::DEPTH16_UNORM_STENCIL8_UINT.data(); case VK_FORMAT_B5G6R5_UNORM_PACK16: diff --git a/src/yuzu/main.cpp b/src/yuzu/main.cpp index 16b74ad92..25aa03934 100644 --- a/src/yuzu/main.cpp +++ b/src/yuzu/main.cpp @@ -1461,7 +1461,6 @@ void GMainWindow::OnAppFocusStateChanged(Qt::ApplicationState state) { OnPauseGame(); } else if (!emu_thread->IsRunning() && auto_paused && state == Qt::ApplicationActive) { auto_paused = false; - RequestGameResume(); OnStartGame(); } } @@ -1703,7 +1702,6 @@ void GMainWindow::OnPrepareForSleep(bool prepare_sleep) { } else { if (!emu_thread->IsRunning() && auto_paused) { auto_paused = false; - RequestGameResume(); OnStartGame(); } } @@ -3458,7 +3456,6 @@ void GMainWindow::OnPauseContinueGame() { if (emu_thread->IsRunning()) { OnPauseGame(); } else { - RequestGameResume(); OnStartGame(); } } @@ -5037,10 +5034,6 @@ void GMainWindow::RequestGameExit() { system->GetAppletManager().RequestExit(); } -void GMainWindow::RequestGameResume() { - system->GetAppletManager().RequestResume(); -} - void GMainWindow::filterBarSetChecked(bool state) { ui->action_Show_Filter_Bar->setChecked(state); emit(OnToggleFilterBar()); diff --git a/src/yuzu/main.h b/src/yuzu/main.h index b88352371..aaade93a8 100644 --- a/src/yuzu/main.h +++ b/src/yuzu/main.h @@ -310,7 +310,6 @@ private: bool ConfirmChangeGame(); bool ConfirmForceLockedExit(); void RequestGameExit(); - void RequestGameResume(); void changeEvent(QEvent* event) override; void closeEvent(QCloseEvent* event) override;