<C++> SSE指令集
SSE指令集
include库
#include <mmintrin.h> //MMX
#include <xmmintrin.h> //SSE(include mmintrin.h)
#include <emmintrin.h> //SSE2(include xmmintrin.h)
#include <pmmintrin.h> //SSE3(include emmintrin.h)
#include <tmmintrin.h> //SSSE3(include pmmintrin.h)
#include <smmintrin.h> //SSE4.1(include tmmintrin.h)
#include <nmmintrin.h> //SSE4.2(include smmintrin.h)
#include <wmmintrin.h> //AES(include nmmintrin.h)
#include <immintrin.h> //AVX(include wmmintrin.h)
#include <intrin.h> //所有版本(include immintrin.h)
基本操作
- 使用SSE专门的LOAD指令将数据从内存加载一个向量到寄存器;
- 使用SSE专门的OP指令对两个向量进行某种计算;
- 使用SSE专门的STORE指令把计算结果从寄存器写回到内存;
数据类型
- __m128表示128bit的向量,包含4个单精度浮点数
typedef union __declspec(intrin_type) __declspec(align(16)) __m128 {float m128_f32[4];unsigned __int64 m128_u64[2];__int8 m128_i8[16];__int16 m128_i16[8];__int32 m128_i32[4];__int64 m128_i64[2];unsigned __int8 m128_u8[16];unsigned __int16 m128_u16[8];unsigned __int32 m128_u32[4];} __m128;
- __m128i表示128bit的整数型
typedef union __declspec(intrin_type) __declspec(align(16)) __m128i {__int8 m128i_i8[16];__int16 m128i_i16[8];__int32 m128i_i32[4];__int64 m128i_i64[2];unsigned __int8 m128i_u8[16];unsigned __int16 m128i_u16[8];unsigned __int32 m128i_u32[4];unsigned __int64 m128i_u64[2];
} __m128i;
- __m128d表示128bit的向量,包含2个双精度浮点数
typedef struct __declspec(intrin_type) __declspec(align(16)) __m128d {double m128d_f64[2];
} __m128d;
指令函数命名
SSE指令的函数从命名上,主要分成三部分,以_mm_loadu_pd为例:
- 第一部分均以_mm开头,表示属于SSE指令集,_mm256或_mm512是AVX或AVX-512指令集的Intrinsic函数前缀;
- 第二部分表明操作类型,比如load,add,store等。但部分指令后面跟有[l|h|u|r]等字母,比如u表示mem_addr不需要内存对齐,r表示反向读取等;
- 第三部分为操作的对象名及数据类型:
_ps:packed操作所有的单精度浮点数;
_pd:packed操作所有的双精度浮点数;
_pixx:(xx为长度,可以是8,16,32,64)packed操作所有的xx位有符号整数,使用的寄存器长度为64位;
_epixx:(xx为长度)packed操作所有的xx位的有符号整数,使用的寄存器长度为128位;
_epuxx: packed操作所有的xx位的无符号整数;
_ss:scalar操作第一个单精度浮点数;
_sd:scalar操作第一个双精度浮点数;
p表示packed即对128bits的数据全部执行相同的操作,s表示scalar,只对128bits中的第一组数据执行操作,如下图所示。
1、load加载
__m128i _mm_lddqu_si128 (__m128i const* mem_addr)
__m128d _mm_load_pd (double const* mem_addr)
__m128d _mm_load_pd1 (double const* mem_addr)
__m128 _mm_load_ps (float const* mem_addr)
__m128 _mm_load_ps1 (float const* mem_addr)
__m128d _mm_load_sd (double const* mem_addr)
__m128i _mm_load_si128 (__m128i const* mem_addr)
__m128 _mm_load_ss (float const* mem_addr)
__m128d _mm_load1_pd (double const* mem_addr)
__m128 _mm_load1_ps (float const* mem_addr)
__m128d _mm_loaddup_pd (double const* mem_addr)
__m128d _mm_loadh_pd (__m128d a, double const* mem_addr)
__m128 _mm_loadh_pi (__m128 a, __m64 const* mem_addr)
__m128i _mm_loadl_epi64 (__m128i const* mem_addr)
__m128d _mm_loadl_pd (__m128d a, double const* mem_addr)
__m128 _mm_loadl_pi (__m128 a, __m64 const* mem_addr)
__m128d _mm_loadr_pd (double const* mem_addr)
__m128 _mm_loadr_ps (float const* mem_addr)
__m128d _mm_loadu_pd (double const* mem_addr)
__m128 _mm_loadu_ps (float const* mem_addr)
__m128i _mm_loadu_si128 (__m128i const* mem_addr)
__m128i _mm_loadu_si16 (void const* mem_addr)
__m128i _mm_loadu_si32 (void const* mem_addr)
__m128i _mm_loadu_si64 (void const* mem_addr)
2、OP操作
Arithmetic算术
__m128i _mm_add_epi16 (__m128i a, __m128i b)
__m128i _mm_add_epi32 (__m128i a, __m128i b)
__m128i _mm_add_epi64 (__m128i a, __m128i b)
__m128i _mm_add_epi8 (__m128i a, __m128i b)
__m128d _mm_add_pd (__m128d a, __m128d b)
__m128 _mm_add_ps (__m128 a, __m128 b)
__m128d _mm_add_sd (__m128d a, __m128d b)
__m64 _mm_add_si64 (__m64 a, __m64 b)
__m128 _mm_add_ss (__m128 a, __m128 b)
__m128i _mm_adds_epi16 (__m128i a, __m128i b)
__m128i _mm_adds_epi8 (__m128i a, __m128i b)
__m128i _mm_adds_epu16 (__m128i a, __m128i b)
__m128i _mm_adds_epu8 (__m128i a, __m128i b)
__m128d _mm_addsub_pd (__m128d a, __m128d b)
__m128 _mm_addsub_ps (__m128 a, __m128 b)
__m128d _mm_div_pd (__m128d a, __m128d b)
__m128 _mm_div_ps (__m128 a, __m128 b)
__m128d _mm_div_sd (__m128d a, __m128d b)
__m128 _mm_div_ss (__m128 a, __m128 b)
__m128d _mm_dp_pd (__m128d a, __m128d b, const int imm8)
__m128 _mm_dp_ps (__m128 a, __m128 b, const int imm8)
__m128i _mm_hadd_epi16 (__m128i a, __m128i b)
__m128i _mm_hadd_epi32 (__m128i a, __m128i b)
__m128d _mm_hadd_pd (__m128d a, __m128d b)
__m64 _mm_hadd_pi16 (__m64 a, __m64 b)
__m64 _mm_hadd_pi32 (__m64 a, __m64 b)
__m128 _mm_hadd_ps (__m128 a, __m128 b)
__m128i _mm_hadds_epi16 (__m128i a, __m128i b)
__m64 _mm_hadds_pi16 (__m64 a, __m64 b)
__m128i _mm_hsub_epi16 (__m128i a, __m128i b)
__m128i _mm_hsub_epi32 (__m128i a, __m128i b)
__m128d _mm_hsub_pd (__m128d a, __m128d b)
__m64 _mm_hsub_pi16 (__m64 a, __m64 b)
__m64 _mm_hsub_pi32 (__m64 a, __m64 b)
__m128 _mm_hsub_ps (__m128 a, __m128 b)
__m128i _mm_hsubs_epi16 (__m128i a, __m128i b)
__m64 _mm_hsubs_pi16 (__m64 a, __m64 b)
__m128i _mm_madd_epi16 (__m128i a, __m128i b)
__m128i _mm_maddubs_epi16 (__m128i a, __m128i b)
__m64 _mm_maddubs_pi16 (__m64 a, __m64 b)
__m128i _mm_mpsadbw_epu8 (__m128i a, __m128i b, const int imm8)
__m128i _mm_mul_epi32 (__m128i a, __m128i b)
__m128i _mm_mul_epu32 (__m128i a, __m128i b)
__m128d _mm_mul_pd (__m128d a, __m128d b)
__m128 _mm_mul_ps (__m128 a, __m128 b)
__m128d _mm_mul_sd (__m128d a, __m128d b)
__m128 _mm_mul_ss (__m128 a, __m128 b)
__m64 _mm_mul_su32 (__m64 a, __m64 b)
__m128i _mm_mulhi_epi16 (__m128i a, __m128i b)
__m128i _mm_mulhi_epu16 (__m128i a, __m128i b)
__m64 _mm_mulhi_pu16 (__m64 a, __m64 b)
__m128i _mm_mulhrs_epi16 (__m128i a, __m128i b)
__m64 _mm_mulhrs_pi16 (__m64 a, __m64 b)
__m128i _mm_mullo_epi16 (__m128i a, __m128i b)
__m128i _mm_mullo_epi32 (__m128i a, __m128i b)
__m64 _m_pmulhuw (__m64 a, __m64 b)
__m64 _m_psadbw (__m64 a, __m64 b)
__m128i _mm_sad_epu8 (__m128i a, __m128i b)
__m64 _mm_sad_pu8 (__m64 a, __m64 b)
__m128i _mm_sign_epi16 (__m128i a, __m128i b)
__m128i _mm_sign_epi32 (__m128i a, __m128i b)
__m128i _mm_sign_epi8 (__m128i a, __m128i b)
__m64 _mm_sign_pi16 (__m64 a, __m64 b)
__m64 _mm_sign_pi32 (__m64 a, __m64 b)
__m64 _mm_sign_pi8 (__m64 a, __m64 b)
__m128i _mm_sub_epi16 (__m128i a, __m128i b)
__m128i _mm_sub_epi32 (__m128i a, __m128i b)
__m128i _mm_sub_epi64 (__m128i a, __m128i b)
__m128i _mm_sub_epi8 (__m128i a, __m128i b)
__m128d _mm_sub_pd (__m128d a, __m128d b)
__m128 _mm_sub_ps (__m128 a, __m128 b)
__m128d _mm_sub_sd (__m128d a, __m128d b)
__m64 _mm_sub_si64 (__m64 a, __m64 b)
__m128 _mm_sub_ss (__m128 a, __m128 b)
__m128i _mm_subs_epi16 (__m128i a, __m128i b)
__m128i _mm_subs_epi8 (__m128i a, __m128i b)
__m128i _mm_subs_epu16 (__m128i a, __m128i b)
__m128i _mm_subs_epu8 (__m128i a, __m128i b)
Compare比较
__m128i _mm_cmpeq_epi16 (__m128i a, __m128i b)
__m128i _mm_cmpeq_epi32 (__m128i a, __m128i b)
__m128i _mm_cmpeq_epi64 (__m128i a, __m128i b)
__m128i _mm_cmpeq_epi8 (__m128i a, __m128i b)
__m128d _mm_cmpeq_pd (__m128d a, __m128d b)
__m128 _mm_cmpeq_ps (__m128 a, __m128 b)
__m128d _mm_cmpeq_sd (__m128d a, __m128d b)
__m128 _mm_cmpeq_ss (__m128 a, __m128 b)
__m128d _mm_cmpge_pd (__m128d a, __m128d b)
__m128 _mm_cmpge_ps (__m128 a, __m128 b)
__m128d _mm_cmpge_sd (__m128d a, __m128d b)
__m128 _mm_cmpge_ss (__m128 a, __m128 b)
__m128i _mm_cmpgt_epi16 (__m128i a, __m128i b)
__m128i _mm_cmpgt_epi32 (__m128i a, __m128i b)
__m128i _mm_cmpgt_epi64 (__m128i a, __m128i b)
__m128i _mm_cmpgt_epi8 (__m128i a, __m128i b)
__m128d _mm_cmpgt_pd (__m128d a, __m128d b)
__m128 _mm_cmpgt_ps (__m128 a, __m128 b)
__m128d _mm_cmpgt_sd (__m128d a, __m128d b)
__m128 _mm_cmpgt_ss (__m128 a, __m128 b)
__m128d _mm_cmple_pd (__m128d a, __m128d b)
__m128 _mm_cmple_ps (__m128 a, __m128 b)
__m128d _mm_cmple_sd (__m128d a, __m128d b)
__m128 _mm_cmple_ss (__m128 a, __m128 b)
__m128i _mm_cmplt_epi16 (__m128i a, __m128i b)
__m128i _mm_cmplt_epi32 (__m128i a, __m128i b)
__m128i _mm_cmplt_epi8 (__m128i a, __m128i b)
__m128d _mm_cmplt_pd (__m128d a, __m128d b)
__m128 _mm_cmplt_ps (__m128 a, __m128 b)
__m128d _mm_cmplt_sd (__m128d a, __m128d b)
__m128 _mm_cmplt_ss (__m128 a, __m128 b)
__m128d _mm_cmpneq_pd (__m128d a, __m128d b)
__m128 _mm_cmpneq_ps (__m128 a, __m128 b)
__m128d _mm_cmpneq_sd (__m128d a, __m128d b)
__m128 _mm_cmpneq_ss (__m128 a, __m128 b)
__m128d _mm_cmpnge_pd (__m128d a, __m128d b)
__m128 _mm_cmpnge_ps (__m128 a, __m128 b)
__m128d _mm_cmpnge_sd (__m128d a, __m128d b)
__m128 _mm_cmpnge_ss (__m128 a, __m128 b)
__m128d _mm_cmpngt_pd (__m128d a, __m128d b)
__m128 _mm_cmpngt_ps (__m128 a, __m128 b)
__m128d _mm_cmpngt_sd (__m128d a, __m128d b)
__m128 _mm_cmpngt_ss (__m128 a, __m128 b)
__m128d _mm_cmpnle_pd (__m128d a, __m128d b)
__m128 _mm_cmpnle_ps (__m128 a, __m128 b)
__m128d _mm_cmpnle_sd (__m128d a, __m128d b)
__m128 _mm_cmpnle_ss (__m128 a, __m128 b)
__m128d _mm_cmpnlt_pd (__m128d a, __m128d b)
__m128 _mm_cmpnlt_ps (__m128 a, __m128 b)
__m128d _mm_cmpnlt_sd (__m128d a, __m128d b)
__m128 _mm_cmpnlt_ss (__m128 a, __m128 b)
__m128d _mm_cmpord_pd (__m128d a, __m128d b)
__m128 _mm_cmpord_ps (__m128 a, __m128 b)
__m128d _mm_cmpord_sd (__m128d a, __m128d b)
__m128 _mm_cmpord_ss (__m128 a, __m128 b)
__m128d _mm_cmpunord_pd (__m128d a, __m128d b)
__m128 _mm_cmpunord_ps (__m128 a, __m128 b)
__m128d _mm_cmpunord_sd (__m128d a, __m128d b)
__m128 _mm_cmpunord_ss (__m128 a, __m128 b)
int _mm_comieq_sd (__m128d a, __m128d b)
int _mm_comieq_ss (__m128 a, __m128 b)
int _mm_comige_sd (__m128d a, __m128d b)
int _mm_comige_ss (__m128 a, __m128 b)
int _mm_comigt_sd (__m128d a, __m128d b)
int _mm_comigt_ss (__m128 a, __m128 b)
int _mm_comile_sd (__m128d a, __m128d b)
int _mm_comile_ss (__m128 a, __m128 b)
int _mm_comilt_sd (__m128d a, __m128d b)
int _mm_comilt_ss (__m128 a, __m128 b)
int _mm_comineq_sd (__m128d a, __m128d b)
int _mm_comineq_ss (__m128 a, __m128 b)
int _mm_ucomieq_sd (__m128d a, __m128d b)
int _mm_ucomieq_ss (__m128 a, __m128 b)
int _mm_ucomige_sd (__m128d a, __m128d b)
int _mm_ucomige_ss (__m128 a, __m128 b)
int _mm_ucomigt_sd (__m128d a, __m128d b)
int _mm_ucomigt_ss (__m128 a, __m128 b)
int _mm_ucomile_sd (__m128d a, __m128d b)
int _mm_ucomile_ss (__m128 a, __m128 b)
int _mm_ucomilt_sd (__m128d a, __m128d b)
int _mm_ucomilt_ss (__m128 a, __m128 b)
int _mm_ucomineq_sd (__m128d a, __m128d b)
int _mm_ucomineq_ss (__m128 a, __m128 b)
Convert转换
__m128 _mm_cvt_pi2ps (__m128 a, __m64 b)
__m64 _mm_cvt_ps2pi (__m128 a)
__m128 _mm_cvt_si2ss (__m128 a, int b)
int _mm_cvt_ss2si (__m128 a)
__m128i _mm_cvtepi16_epi32 (__m128i a)
__m128i _mm_cvtepi16_epi64 (__m128i a)
__m128i _mm_cvtepi32_epi64 (__m128i a)
__m128d _mm_cvtepi32_pd (__m128i a)
__m128 _mm_cvtepi32_ps (__m128i a)
__m128i _mm_cvtepi8_epi16 (__m128i a)
__m128i _mm_cvtepi8_epi32 (__m128i a)
__m128i _mm_cvtepi8_epi64 (__m128i a)
__m128i _mm_cvtepu16_epi32 (__m128i a)
__m128i _mm_cvtepu16_epi64 (__m128i a)
__m128i _mm_cvtepu32_epi64 (__m128i a)
__m128i _mm_cvtepu8_epi16 (__m128i a)
__m128i _mm_cvtepu8_epi32 (__m128i a)
__m128i _mm_cvtepu8_epi64 (__m128i a)
__m128i _mm_cvtpd_epi32 (__m128d a)
__m64 _mm_cvtpd_pi32 (__m128d a)
__m128 _mm_cvtpd_ps (__m128d a)
__m128 _mm_cvtpi16_ps (__m64 a)
__m128d _mm_cvtpi32_pd (__m64 a)
__m128 _mm_cvtpi32_ps (__m128 a, __m64 b)
__m128 _mm_cvtpi32x2_ps (__m64 a, __m64 b)
__m128 _mm_cvtpi8_ps (__m64 a)
__m128i _mm_cvtps_epi32 (__m128 a)
__m128d _mm_cvtps_pd (__m128 a)
__m64 _mm_cvtps_pi16 (__m128 a)
__m64 _mm_cvtps_pi32 (__m128 a)
__m64 _mm_cvtps_pi8 (__m128 a)
__m128 _mm_cvtpu16_ps (__m64 a)
__m128 _mm_cvtpu8_ps (__m64 a)
double _mm_cvtsd_f64 (__m128d a)
int _mm_cvtsd_si32 (__m128d a)
__int64 _mm_cvtsd_si64 (__m128d a)
__int64 _mm_cvtsd_si64x (__m128d a)
__m128 _mm_cvtsd_ss (__m128 a, __m128d b)
int _mm_cvtsi128_si32 (__m128i a)
__int64 _mm_cvtsi128_si64 (__m128i a)
__int64 _mm_cvtsi128_si64x (__m128i a)
__m128d _mm_cvtsi32_sd (__m128d a, int b)
__m128i _mm_cvtsi32_si128 (int a)
__m128 _mm_cvtsi32_ss (__m128 a, int b)
__m128d _mm_cvtsi64_sd (__m128d a, __int64 b)
__m128i _mm_cvtsi64_si128 (__int64 a)
__m128 _mm_cvtsi64_ss (__m128 a, __int64 b)
__m128d _mm_cvtsi64x_sd (__m128d a, __int64 b)
__m128i _mm_cvtsi64x_si128 (__int64 a)
float _mm_cvtss_f32 (__m128 a)
__m128d _mm_cvtss_sd (__m128d a, __m128 b)
int _mm_cvtss_si32 (__m128 a)
__int64 _mm_cvtss_si64 (__m128 a)
__m64 _mm_cvtt_ps2pi (__m128 a)
int _mm_cvtt_ss2si (__m128 a)
__m128i _mm_cvttpd_epi32 (__m128d a)
__m64 _mm_cvttpd_pi32 (__m128d a)
__m128i _mm_cvttps_epi32 (__m128 a)
__m64 _mm_cvttps_pi32 (__m128 a)
int _mm_cvttsd_si32 (__m128d a)
__int64 _mm_cvttsd_si64 (__m128d a)
__int64 _mm_cvttsd_si64x (__m128d a)
int _mm_cvttss_si32 (__m128 a)
__int64 _mm_cvttss_si64 (__m128 a)
__m128i _mm_packus_epi32 (__m128i a, __m128i b)
Logical逻辑
__m128d _mm_and_pd (__m128d a, __m128d b)
__m128 _mm_and_ps (__m128 a, __m128 b)
__m128i _mm_and_si128 (__m128i a, __m128i b)
__m128d _mm_andnot_pd (__m128d a, __m128d b)
__m128 _mm_andnot_ps (__m128 a, __m128 b)
__m128i _mm_andnot_si128 (__m128i a, __m128i b)
__m128d _mm_or_pd (__m128d a, __m128d b)
__m128 _mm_or_ps (__m128 a, __m128 b)
__m128i _mm_or_si128 (__m128i a, __m128i b)
int _mm_test_all_ones (__m128i a)
int _mm_test_all_zeros (__m128i mask, __m128i a)
int _mm_test_mix_ones_zeros (__m128i mask, __m128i a)
int _mm_testc_si128 (__m128i a, __m128i b)
int _mm_testnzc_si128 (__m128i a, __m128i b)
int _mm_testz_si128 (__m128i a, __m128i b)
__m128d _mm_xor_pd (__m128d a, __m128d b)
__m128 _mm_xor_ps (__m128 a, __m128 b)
__m128i _mm_xor_si128 (__m128i a, __m128i b)
Set设置
__m128i _mm_set_epi16 (short e7, short e6, short e5, short e4, short e3, short e2, short e1, short e0)
__m128i _mm_set_epi32 (int e3, int e2, int e1, int e0)
__m128i _mm_set_epi64 (__m64 e1, __m64 e0)
__m128i _mm_set_epi64x (__int64 e1, __int64 e0)
__m128i _mm_set_epi8 (char e15, char e14, char e13, char e12, char e11, char e10, char e9, char e8, char e7, char e6, char e5, char e4, char e3, char e2, char e1, char e0)
__m128d _mm_set_pd (double e1, double e0)
__m128d _mm_set_pd1 (double a)
__m128 _mm_set_ps (float e3, float e2, float e1, float e0)
__m128 _mm_set_ps1 (float a)
__m128d _mm_set_sd (double a)
__m128 _mm_set_ss (float a)
__m128i _mm_set1_epi16 (short a)
__m128i _mm_set1_epi32 (int a)
__m128i _mm_set1_epi64 (__m64 a)
__m128i _mm_set1_epi64x (__int64 a)
__m128i _mm_set1_epi8 (char a)
__m128d _mm_set1_pd (double a)
__m128 _mm_set1_ps (float a)
__m128i _mm_setr_epi16 (short e7, short e6, short e5, short e4, short e3, short e2, short e1, short e0)
__m128i _mm_setr_epi32 (int e3, int e2, int e1, int e0)
__m128i _mm_setr_epi64 (__m64 e1, __m64 e0)
__m128i _mm_setr_epi8 (char e15, char e14, char e13, char e12, char e11, char e10, char e9, char e8, char e7, char e6, char e5, char e4, char e3, char e2, char e1, char e0)
__m128d _mm_setr_pd (double e1, double e0)
__m128 _mm_setr_ps (float e3, float e2, float e1, float e0)
__m128d _mm_setzero_pd (void)
__m128 _mm_setzero_ps (void)
__m128i _mm_setzero_si128 ()
3、Store存储
void _mm_maskmove_si64 (__m64 a, __m64 mask, char* mem_addr)
void _mm_maskmoveu_si128 (__m128i a, __m128i mask, char* mem_addr)
void _m_maskmovq (__m64 a, __m64 mask, char* mem_addr)
void _mm_store_pd (double* mem_addr, __m128d a)
void _mm_store_pd1 (double* mem_addr, __m128d a)
void _mm_store_ps (float* mem_addr, __m128 a)
void _mm_store_ps1 (float* mem_addr, __m128 a)
void _mm_store_sd (double* mem_addr, __m128d a)
void _mm_store_si128 (__m128i* mem_addr, __m128i a)
void _mm_store_ss (float* mem_addr, __m128 a)
void _mm_store1_pd (double* mem_addr, __m128d a)
void _mm_store1_ps (float* mem_addr, __m128 a)
void _mm_storeh_pd (double* mem_addr, __m128d a)
void _mm_storeh_pi (__m64* mem_addr, __m128 a)
void _mm_storel_epi64 (__m128i* mem_addr, __m128i a)
void _mm_storel_pd (double* mem_addr, __m128d a)
void _mm_storel_pi (__m64* mem_addr, __m128 a)
void _mm_storer_pd (double* mem_addr, __m128d a)
void _mm_storer_ps (float* mem_addr, __m128 a)
void _mm_storeu_pd (double* mem_addr, __m128d a)
void _mm_storeu_ps (float* mem_addr, __m128 a)
void _mm_storeu_si128 (__m128i* mem_addr, __m128i a)
void _mm_storeu_si16 (void* mem_addr, __m128i a)
void _mm_storeu_si32 (void* mem_addr, __m128i a)
void _mm_storeu_si64 (void* mem_addr, __m128i a)
void _mm_stream_pd (double* mem_addr, __m128d a)
void _mm_stream_pi (__m64* mem_addr, __m64 a)
void _mm_stream_ps (float* mem_addr, __m128 a)
void _mm_stream_si128 (__m128i* mem_addr, __m128i a)
void _mm_stream_si32 (int* mem_addr, int a)
void _mm_stream_si64 (__int64* mem_addr, __int64 a)
参考
1、https://www.zhihu.com/column/c_1550937293912748032
2、https://zhuanlan.zhihu.com/p/409973153
3、https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#ig_expand=4880,3865,6557&techs=SSE_ALL
相关文章:

<C++> SSE指令集
SSE指令集 include库 #include <mmintrin.h> //MMX #include <xmmintrin.h> //SSE(include mmintrin.h) #include <emmintrin.h> //SSE2(include xmmintrin.h) #include <pmmintrin.h> //SSE3(include emmintrin.h) #include <tmmintrin.h> /…...

cortex-A7核LED灯实验--STM32MP157
实验目的:实现LED1 / LED2 / LED3三盏灯工作 一,分析电路图 1,思路 分析电路图可知: 网络编号 引脚编号 LED1 PE10 LED2 > PF10 LED3 > PE8 2,工作原理: 写1:LED灯亮…...

WPF实战项目十三(API篇):备忘录功能api接口、优化待办事项api接口
1、新建MenoDto.cs /// <summary>/// 备忘录传输实体/// </summary>public class MenoDto : BaseDto{private string title;/// <summary>/// 标题/// </summary>public string Title{get { return title; }set { title = value; }}private string con…...

clickhouse(十四、分布式DDL阻塞及同步阻塞问题)
文章目录 一、分布式ddl 阻塞、超时现象验证方法解决方案 二、副本同步阻塞现象验证解决方案 一、分布式ddl 阻塞、超时 现象 在clickhouse 集群的操作中,如果同时执行一些重量级变更语句,往往会引起阻塞。 一般是由于节点堆积过多耗时的ddl。然后抛出…...

怎么入门网络安全(黑客)?
目录: 一、自学网络安全学习的误区和陷阱 1.不要试图先成为一名程序员(以编程为基础的学习)再开始学习2.不要把深度学习作为入门第一课3.以黑客技能、兴趣为方向的自学误区:4.不要收集过多的资料二、学习网络安全的一些前期准备三…...

c++ boost::json
Boost社区12月11日发布了1.75版本,在之前,Boost使用Boost.PropertyTree解析JSON,XML,INI和INFO格式的文件。但是由于成文较早及需要兼容其他的数据格式,相比较于其他的…...

《Flink学习笔记》——第九章 多流转换
无论是基本的简单转换和聚合,还是基于窗口的计算,我们都是针对一条流上的数据进行处理的。而在实际应用中,可能需要将不同来源的数据连接合并在一起处理,也有可能需要将一条流拆分开,所以经常会有对多条流进行处理的场…...
openmmlab出现KeyError: ‘xxx is not in the model registry....‘
问题描述 在复现基于mmpose框架的算法时,运行程序出现KeyError: xxx is not in the model registry....的问题,报错原因是自定义的backbone等结构或者某些当前代码使用的方法没有注册到现有的包中, 导致在import的时候无法导入该方法。 解决方案 找到…...

错误代码0x80131500要怎么解决?快速修复方法
错误代码0x80131500通常与.NET Framework 相关的问题有关。它可能表示.NET Framework的安装损坏、版本冲突或系统文件缺失等。下面我们一起来探讨一下解决错误代码0x80131500的方法有哪些。 以下是一些解决方法 安装最新的.NET Framework版本:访问Microsoft官方网站,…...
PMO(Project Management Office)
PMO 是项目管理办公室(Project Management Office)的缩写。它是组织内的一个部门或团队,负责支持和促进项目管理活动,以确保项目按时、按预算、按要求完成。 PMO 的职责和角色可以因组织的性质和需求而有所不同,但通常…...

STM32 CUBEMX CAN通信数据发送失败原因分析
CAN通信是一种数据通信协议,用于在不同设备之间进行通信。它是一种高效的、实时的、可靠的、多主机的、串行通信系统,通常用于汽车电子、工业自动化等领域。CAN通信协议是由德国BOSCH公司于1986年引入,并在欧洲和日本广泛使用。CAN通信具有独…...

长安链并行调度机制(2):DAG构建和从节点执行流程
长安链采用高效的并行调度方式执行交易,了解长安链交易调度、冲突检测和DAG构建流程有助于开发者更好地理解长安链并行调度的运行机制,帮助开发者编写高质量、低冲突的智能合约,更好地构建区块链应用。 上一篇内容我们说明了长安链交易调度、…...
leetcode做题笔记110. 平衡二叉树
给定一个二叉树,判断它是否是高度平衡的二叉树。 本题中,一棵高度平衡二叉树定义为: 一个二叉树每个节点 的左右两个子树的高度差的绝对值不超过 1 。 思路一:递归 int height(struct TreeNode* root) {if (root == NULL) {return…...
iOS开发Swift-字符串与字符
1.字符串的定义 let someString = "some string value"2.多行字符串的定义(""") let quotation = """ 有一个人前来买瓜。 "这瓜甜吗?"他问。 """前一个"""前和后一个""&…...
Linux Kernel:syscall之fork与exec
环境: Kernel Version:Linux-5.10 ARCH:ARM64 一:前言 上一节我们提到了进程的产生方式fork,exec与clone,本节将详细分析fork和exec族系统调用的具体实现。通常这些调用不是由应用程序直接发出的,而是通过一个中间层调用,即负责与内核通信的C标准库。从用户状态切换到…...
CentOS 修改MySQL密码
CentOS 修改MySQL密码 1.登录MySQL 2.执行如下命令 update user set password=password('mivbAs7Awc') where user='root';报错如下: Unknown column ‘password’ in ‘field list’ 3.执行如下命令 update user set password=password('mivbAs7Awc') where user='root'碰到…...
Android通过setaffinity实现绑核
有时候为了降低App算力占用,会把关键的线程绑定到大核中,下面介绍一种绑核的方式 查看绑核 查看pid :/ # ps -A | grep test u0_a15 25178 405 15950272 176544 do_epoll_wait 0 S com.test.jnites查看线程号 top -H -p 25178 25224 u0_…...

stm32的位带操作
在51单片机中,我们可以使用P2^1来对单片机的某一位进行操作,到了stm32,我们通过位带操作,将寄存器的每一位映射到一个32位的地址。如下是我查资料摘录的一些图片。 映射方式 SRAM: AliasAddr = 0x22000000 + (A-0X20000000)*8*4 + n*4…...

Java 电子招标采购系统源码:营造全面规范安全的电子招投标环境,促进招投标市场健康可持续发展
营造全面规范安全的电子招投标环境,促进招投标市场健康可持续发展 传统采购模式面临的挑战 一、立项管理 1、招标立项申请 功能点:招标类项目立项申请入口,用户可以保存为草稿,提交。 2、非招标立项申请 功能点:非招标…...
https协议经过SpringMVC重定向之后变成http协议
之前项目的协议还是http,当改为https之后,就出现了这个问题。 服务访问地址:https://wuxinke.demo.com 访问某个页面的地址:https://wuxinke.demo.com/aps/judgeProviderOrCtenant.ht 经SpringMVC重定向之后,地址变…...

龙虎榜——20250610
上证指数放量收阴线,个股多数下跌,盘中受消息影响大幅波动。 深证指数放量收阴线形成顶分型,指数短线有调整的需求,大概需要一两天。 2025年6月10日龙虎榜行业方向分析 1. 金融科技 代表标的:御银股份、雄帝科技 驱动…...

多云管理“拦路虎”:深入解析网络互联、身份同步与成本可视化的技术复杂度
一、引言:多云环境的技术复杂性本质 企业采用多云策略已从技术选型升维至生存刚需。当业务系统分散部署在多个云平台时,基础设施的技术债呈现指数级积累。网络连接、身份认证、成本管理这三大核心挑战相互嵌套:跨云网络构建数据…...

Flask RESTful 示例
目录 1. 环境准备2. 安装依赖3. 修改main.py4. 运行应用5. API使用示例获取所有任务获取单个任务创建新任务更新任务删除任务 中文乱码问题: 下面创建一个简单的Flask RESTful API示例。首先,我们需要创建环境,安装必要的依赖,然后…...
多场景 OkHttpClient 管理器 - Android 网络通信解决方案
下面是一个完整的 Android 实现,展示如何创建和管理多个 OkHttpClient 实例,分别用于长连接、普通 HTTP 请求和文件下载场景。 <?xml version="1.0" encoding="utf-8"?> <LinearLayout xmlns:android="http://schemas…...

《用户共鸣指数(E)驱动品牌大模型种草:如何抢占大模型搜索结果情感高地》
在注意力分散、内容高度同质化的时代,情感连接已成为品牌破圈的关键通道。我们在服务大量品牌客户的过程中发现,消费者对内容的“有感”程度,正日益成为影响品牌传播效率与转化率的核心变量。在生成式AI驱动的内容生成与推荐环境中,…...
ffmpeg(四):滤镜命令
FFmpeg 的滤镜命令是用于音视频处理中的强大工具,可以完成剪裁、缩放、加水印、调色、合成、旋转、模糊、叠加字幕等复杂的操作。其核心语法格式一般如下: ffmpeg -i input.mp4 -vf "滤镜参数" output.mp4或者带音频滤镜: ffmpeg…...
Caliper 配置文件解析:config.yaml
Caliper 是一个区块链性能基准测试工具,用于评估不同区块链平台的性能。下面我将详细解释你提供的 fisco-bcos.json 文件结构,并说明它与 config.yaml 文件的关系。 fisco-bcos.json 文件解析 这个文件是针对 FISCO-BCOS 区块链网络的 Caliper 配置文件,主要包含以下几个部…...

推荐 github 项目:GeminiImageApp(图片生成方向,可以做一定的素材)
推荐 github 项目:GeminiImageApp(图片生成方向,可以做一定的素材) 这个项目能干嘛? 使用 gemini 2.0 的 api 和 google 其他的 api 来做衍生处理 简化和优化了文生图和图生图的行为(我的最主要) 并且有一些目标检测和切割(我用不到) 视频和 imagefx 因为没 a…...

【C++特殊工具与技术】优化内存分配(一):C++中的内存分配
目录 一、C 内存的基本概念 1.1 内存的物理与逻辑结构 1.2 C 程序的内存区域划分 二、栈内存分配 2.1 栈内存的特点 2.2 栈内存分配示例 三、堆内存分配 3.1 new和delete操作符 4.2 内存泄漏与悬空指针问题 4.3 new和delete的重载 四、智能指针…...
提升移动端网页调试效率:WebDebugX 与常见工具组合实践
在日常移动端开发中,网页调试始终是一个高频但又极具挑战的环节。尤其在面对 iOS 与 Android 的混合技术栈、各种设备差异化行为时,开发者迫切需要一套高效、可靠且跨平台的调试方案。过去,我们或多或少使用过 Chrome DevTools、Remote Debug…...