<C++> SSE指令集
SSE指令集
include库
#include <mmintrin.h> //MMX
#include <xmmintrin.h> //SSE(include mmintrin.h)
#include <emmintrin.h> //SSE2(include xmmintrin.h)
#include <pmmintrin.h> //SSE3(include emmintrin.h)
#include <tmmintrin.h> //SSSE3(include pmmintrin.h)
#include <smmintrin.h> //SSE4.1(include tmmintrin.h)
#include <nmmintrin.h> //SSE4.2(include smmintrin.h)
#include <wmmintrin.h> //AES(include nmmintrin.h)
#include <immintrin.h> //AVX(include wmmintrin.h)
#include <intrin.h> //所有版本(include immintrin.h)
基本操作
- 使用SSE专门的LOAD指令将数据从内存加载一个向量到寄存器;
- 使用SSE专门的OP指令对两个向量进行某种计算;
- 使用SSE专门的STORE指令把计算结果从寄存器写回到内存;
数据类型
- __m128表示128bit的向量,包含4个32位单精度浮点数
typedef union __declspec(intrin_type) __declspec(align(16)) __m128 {float m128_f32[4];unsigned __int64 m128_u64[2];__int8 m128_i8[16];__int16 m128_i16[8];__int32 m128_i32[4];__int64 m128_i64[2];unsigned __int8 m128_u8[16];unsigned __int16 m128_u16[8];unsigned __int32 m128_u32[4];} __m128;
- __m128i表示128bit的整数向量,可按8/16/32/64位整数元素解释
typedef union __declspec(intrin_type) __declspec(align(16)) __m128i {__int8 m128i_i8[16];__int16 m128i_i16[8];__int32 m128i_i32[4];__int64 m128i_i64[2];unsigned __int8 m128i_u8[16];unsigned __int16 m128i_u16[8];unsigned __int32 m128i_u32[4];unsigned __int64 m128i_u64[2];
} __m128i;
- __m128d表示128bit的向量,包含2个64位双精度浮点数
typedef struct __declspec(intrin_type) __declspec(align(16)) __m128d {double m128d_f64[2];
} __m128d;
指令函数命名
SSE指令的函数从命名上,主要分成三部分,以_mm_loadu_pd为例:
- 第一部分均以_mm开头,表示属于SSE指令集,_mm256或_mm512是AVX或AVX-512指令集的Intrinsic函数前缀;
- 第二部分表明操作类型,比如load,add,store等。但部分指令后面跟有[l|h|u|r]等字母,比如u表示mem_addr不需要内存对齐,r表示反向读取等;
- 第三部分为操作的对象名及数据类型:
_ps:packed操作所有的单精度浮点数;
_pd:packed操作所有的双精度浮点数;
_pixx:(xx为长度,可以是8,16,32,64)packed操作所有的xx位有符号整数,使用的寄存器长度为64位;
_epixx:(xx为长度)packed操作所有的xx位的有符号整数,使用的寄存器长度为128位;
_epuxx: packed操作所有的xx位的无符号整数;
_ss:scalar操作第一个单精度浮点数;
_sd:scalar操作第一个双精度浮点数;
p表示packed即对128bits的数据全部执行相同的操作,s表示scalar,只对128bits中的第一组数据执行操作,如下图所示。

1、load加载
__m128i _mm_lddqu_si128 (__m128i const* mem_addr)
__m128d _mm_load_pd (double const* mem_addr)
__m128d _mm_load_pd1 (double const* mem_addr)
__m128 _mm_load_ps (float const* mem_addr)
__m128 _mm_load_ps1 (float const* mem_addr)
__m128d _mm_load_sd (double const* mem_addr)
__m128i _mm_load_si128 (__m128i const* mem_addr)
__m128 _mm_load_ss (float const* mem_addr)
__m128d _mm_load1_pd (double const* mem_addr)
__m128 _mm_load1_ps (float const* mem_addr)
__m128d _mm_loaddup_pd (double const* mem_addr)
__m128d _mm_loadh_pd (__m128d a, double const* mem_addr)
__m128 _mm_loadh_pi (__m128 a, __m64 const* mem_addr)
__m128i _mm_loadl_epi64 (__m128i const* mem_addr)
__m128d _mm_loadl_pd (__m128d a, double const* mem_addr)
__m128 _mm_loadl_pi (__m128 a, __m64 const* mem_addr)
__m128d _mm_loadr_pd (double const* mem_addr)
__m128 _mm_loadr_ps (float const* mem_addr)
__m128d _mm_loadu_pd (double const* mem_addr)
__m128 _mm_loadu_ps (float const* mem_addr)
__m128i _mm_loadu_si128 (__m128i const* mem_addr)
__m128i _mm_loadu_si16 (void const* mem_addr)
__m128i _mm_loadu_si32 (void const* mem_addr)
__m128i _mm_loadu_si64 (void const* mem_addr)
2、OP操作
Arithmetic算术
__m128i _mm_add_epi16 (__m128i a, __m128i b)
__m128i _mm_add_epi32 (__m128i a, __m128i b)
__m128i _mm_add_epi64 (__m128i a, __m128i b)
__m128i _mm_add_epi8 (__m128i a, __m128i b)
__m128d _mm_add_pd (__m128d a, __m128d b)
__m128 _mm_add_ps (__m128 a, __m128 b)
__m128d _mm_add_sd (__m128d a, __m128d b)
__m64 _mm_add_si64 (__m64 a, __m64 b)
__m128 _mm_add_ss (__m128 a, __m128 b)
__m128i _mm_adds_epi16 (__m128i a, __m128i b)
__m128i _mm_adds_epi8 (__m128i a, __m128i b)
__m128i _mm_adds_epu16 (__m128i a, __m128i b)
__m128i _mm_adds_epu8 (__m128i a, __m128i b)
__m128d _mm_addsub_pd (__m128d a, __m128d b)
__m128 _mm_addsub_ps (__m128 a, __m128 b)
__m128d _mm_div_pd (__m128d a, __m128d b)
__m128 _mm_div_ps (__m128 a, __m128 b)
__m128d _mm_div_sd (__m128d a, __m128d b)
__m128 _mm_div_ss (__m128 a, __m128 b)
__m128d _mm_dp_pd (__m128d a, __m128d b, const int imm8)
__m128 _mm_dp_ps (__m128 a, __m128 b, const int imm8)
__m128i _mm_hadd_epi16 (__m128i a, __m128i b)
__m128i _mm_hadd_epi32 (__m128i a, __m128i b)
__m128d _mm_hadd_pd (__m128d a, __m128d b)
__m64 _mm_hadd_pi16 (__m64 a, __m64 b)
__m64 _mm_hadd_pi32 (__m64 a, __m64 b)
__m128 _mm_hadd_ps (__m128 a, __m128 b)
__m128i _mm_hadds_epi16 (__m128i a, __m128i b)
__m64 _mm_hadds_pi16 (__m64 a, __m64 b)
__m128i _mm_hsub_epi16 (__m128i a, __m128i b)
__m128i _mm_hsub_epi32 (__m128i a, __m128i b)
__m128d _mm_hsub_pd (__m128d a, __m128d b)
__m64 _mm_hsub_pi16 (__m64 a, __m64 b)
__m64 _mm_hsub_pi32 (__m64 a, __m64 b)
__m128 _mm_hsub_ps (__m128 a, __m128 b)
__m128i _mm_hsubs_epi16 (__m128i a, __m128i b)
__m64 _mm_hsubs_pi16 (__m64 a, __m64 b)
__m128i _mm_madd_epi16 (__m128i a, __m128i b)
__m128i _mm_maddubs_epi16 (__m128i a, __m128i b)
__m64 _mm_maddubs_pi16 (__m64 a, __m64 b)
__m128i _mm_mpsadbw_epu8 (__m128i a, __m128i b, const int imm8)
__m128i _mm_mul_epi32 (__m128i a, __m128i b)
__m128i _mm_mul_epu32 (__m128i a, __m128i b)
__m128d _mm_mul_pd (__m128d a, __m128d b)
__m128 _mm_mul_ps (__m128 a, __m128 b)
__m128d _mm_mul_sd (__m128d a, __m128d b)
__m128 _mm_mul_ss (__m128 a, __m128 b)
__m64 _mm_mul_su32 (__m64 a, __m64 b)
__m128i _mm_mulhi_epi16 (__m128i a, __m128i b)
__m128i _mm_mulhi_epu16 (__m128i a, __m128i b)
__m64 _mm_mulhi_pu16 (__m64 a, __m64 b)
__m128i _mm_mulhrs_epi16 (__m128i a, __m128i b)
__m64 _mm_mulhrs_pi16 (__m64 a, __m64 b)
__m128i _mm_mullo_epi16 (__m128i a, __m128i b)
__m128i _mm_mullo_epi32 (__m128i a, __m128i b)
__m64 _m_pmulhuw (__m64 a, __m64 b)
__m64 _m_psadbw (__m64 a, __m64 b)
__m128i _mm_sad_epu8 (__m128i a, __m128i b)
__m64 _mm_sad_pu8 (__m64 a, __m64 b)
__m128i _mm_sign_epi16 (__m128i a, __m128i b)
__m128i _mm_sign_epi32 (__m128i a, __m128i b)
__m128i _mm_sign_epi8 (__m128i a, __m128i b)
__m64 _mm_sign_pi16 (__m64 a, __m64 b)
__m64 _mm_sign_pi32 (__m64 a, __m64 b)
__m64 _mm_sign_pi8 (__m64 a, __m64 b)
__m128i _mm_sub_epi16 (__m128i a, __m128i b)
__m128i _mm_sub_epi32 (__m128i a, __m128i b)
__m128i _mm_sub_epi64 (__m128i a, __m128i b)
__m128i _mm_sub_epi8 (__m128i a, __m128i b)
__m128d _mm_sub_pd (__m128d a, __m128d b)
__m128 _mm_sub_ps (__m128 a, __m128 b)
__m128d _mm_sub_sd (__m128d a, __m128d b)
__m64 _mm_sub_si64 (__m64 a, __m64 b)
__m128 _mm_sub_ss (__m128 a, __m128 b)
__m128i _mm_subs_epi16 (__m128i a, __m128i b)
__m128i _mm_subs_epi8 (__m128i a, __m128i b)
__m128i _mm_subs_epu16 (__m128i a, __m128i b)
__m128i _mm_subs_epu8 (__m128i a, __m128i b)
Compare比较
__m128i _mm_cmpeq_epi16 (__m128i a, __m128i b)
__m128i _mm_cmpeq_epi32 (__m128i a, __m128i b)
__m128i _mm_cmpeq_epi64 (__m128i a, __m128i b)
__m128i _mm_cmpeq_epi8 (__m128i a, __m128i b)
__m128d _mm_cmpeq_pd (__m128d a, __m128d b)
__m128 _mm_cmpeq_ps (__m128 a, __m128 b)
__m128d _mm_cmpeq_sd (__m128d a, __m128d b)
__m128 _mm_cmpeq_ss (__m128 a, __m128 b)
__m128d _mm_cmpge_pd (__m128d a, __m128d b)
__m128 _mm_cmpge_ps (__m128 a, __m128 b)
__m128d _mm_cmpge_sd (__m128d a, __m128d b)
__m128 _mm_cmpge_ss (__m128 a, __m128 b)
__m128i _mm_cmpgt_epi16 (__m128i a, __m128i b)
__m128i _mm_cmpgt_epi32 (__m128i a, __m128i b)
__m128i _mm_cmpgt_epi64 (__m128i a, __m128i b)
__m128i _mm_cmpgt_epi8 (__m128i a, __m128i b)
__m128d _mm_cmpgt_pd (__m128d a, __m128d b)
__m128 _mm_cmpgt_ps (__m128 a, __m128 b)
__m128d _mm_cmpgt_sd (__m128d a, __m128d b)
__m128 _mm_cmpgt_ss (__m128 a, __m128 b)
__m128d _mm_cmple_pd (__m128d a, __m128d b)
__m128 _mm_cmple_ps (__m128 a, __m128 b)
__m128d _mm_cmple_sd (__m128d a, __m128d b)
__m128 _mm_cmple_ss (__m128 a, __m128 b)
__m128i _mm_cmplt_epi16 (__m128i a, __m128i b)
__m128i _mm_cmplt_epi32 (__m128i a, __m128i b)
__m128i _mm_cmplt_epi8 (__m128i a, __m128i b)
__m128d _mm_cmplt_pd (__m128d a, __m128d b)
__m128 _mm_cmplt_ps (__m128 a, __m128 b)
__m128d _mm_cmplt_sd (__m128d a, __m128d b)
__m128 _mm_cmplt_ss (__m128 a, __m128 b)
__m128d _mm_cmpneq_pd (__m128d a, __m128d b)
__m128 _mm_cmpneq_ps (__m128 a, __m128 b)
__m128d _mm_cmpneq_sd (__m128d a, __m128d b)
__m128 _mm_cmpneq_ss (__m128 a, __m128 b)
__m128d _mm_cmpnge_pd (__m128d a, __m128d b)
__m128 _mm_cmpnge_ps (__m128 a, __m128 b)
__m128d _mm_cmpnge_sd (__m128d a, __m128d b)
__m128 _mm_cmpnge_ss (__m128 a, __m128 b)
__m128d _mm_cmpngt_pd (__m128d a, __m128d b)
__m128 _mm_cmpngt_ps (__m128 a, __m128 b)
__m128d _mm_cmpngt_sd (__m128d a, __m128d b)
__m128 _mm_cmpngt_ss (__m128 a, __m128 b)
__m128d _mm_cmpnle_pd (__m128d a, __m128d b)
__m128 _mm_cmpnle_ps (__m128 a, __m128 b)
__m128d _mm_cmpnle_sd (__m128d a, __m128d b)
__m128 _mm_cmpnle_ss (__m128 a, __m128 b)
__m128d _mm_cmpnlt_pd (__m128d a, __m128d b)
__m128 _mm_cmpnlt_ps (__m128 a, __m128 b)
__m128d _mm_cmpnlt_sd (__m128d a, __m128d b)
__m128 _mm_cmpnlt_ss (__m128 a, __m128 b)
__m128d _mm_cmpord_pd (__m128d a, __m128d b)
__m128 _mm_cmpord_ps (__m128 a, __m128 b)
__m128d _mm_cmpord_sd (__m128d a, __m128d b)
__m128 _mm_cmpord_ss (__m128 a, __m128 b)
__m128d _mm_cmpunord_pd (__m128d a, __m128d b)
__m128 _mm_cmpunord_ps (__m128 a, __m128 b)
__m128d _mm_cmpunord_sd (__m128d a, __m128d b)
__m128 _mm_cmpunord_ss (__m128 a, __m128 b)
int _mm_comieq_sd (__m128d a, __m128d b)
int _mm_comieq_ss (__m128 a, __m128 b)
int _mm_comige_sd (__m128d a, __m128d b)
int _mm_comige_ss (__m128 a, __m128 b)
int _mm_comigt_sd (__m128d a, __m128d b)
int _mm_comigt_ss (__m128 a, __m128 b)
int _mm_comile_sd (__m128d a, __m128d b)
int _mm_comile_ss (__m128 a, __m128 b)
int _mm_comilt_sd (__m128d a, __m128d b)
int _mm_comilt_ss (__m128 a, __m128 b)
int _mm_comineq_sd (__m128d a, __m128d b)
int _mm_comineq_ss (__m128 a, __m128 b)
int _mm_ucomieq_sd (__m128d a, __m128d b)
int _mm_ucomieq_ss (__m128 a, __m128 b)
int _mm_ucomige_sd (__m128d a, __m128d b)
int _mm_ucomige_ss (__m128 a, __m128 b)
int _mm_ucomigt_sd (__m128d a, __m128d b)
int _mm_ucomigt_ss (__m128 a, __m128 b)
int _mm_ucomile_sd (__m128d a, __m128d b)
int _mm_ucomile_ss (__m128 a, __m128 b)
int _mm_ucomilt_sd (__m128d a, __m128d b)
int _mm_ucomilt_ss (__m128 a, __m128 b)
int _mm_ucomineq_sd (__m128d a, __m128d b)
int _mm_ucomineq_ss (__m128 a, __m128 b)
Convert转换
__m128 _mm_cvt_pi2ps (__m128 a, __m64 b)
__m64 _mm_cvt_ps2pi (__m128 a)
__m128 _mm_cvt_si2ss (__m128 a, int b)
int _mm_cvt_ss2si (__m128 a)
__m128i _mm_cvtepi16_epi32 (__m128i a)
__m128i _mm_cvtepi16_epi64 (__m128i a)
__m128i _mm_cvtepi32_epi64 (__m128i a)
__m128d _mm_cvtepi32_pd (__m128i a)
__m128 _mm_cvtepi32_ps (__m128i a)
__m128i _mm_cvtepi8_epi16 (__m128i a)
__m128i _mm_cvtepi8_epi32 (__m128i a)
__m128i _mm_cvtepi8_epi64 (__m128i a)
__m128i _mm_cvtepu16_epi32 (__m128i a)
__m128i _mm_cvtepu16_epi64 (__m128i a)
__m128i _mm_cvtepu32_epi64 (__m128i a)
__m128i _mm_cvtepu8_epi16 (__m128i a)
__m128i _mm_cvtepu8_epi32 (__m128i a)
__m128i _mm_cvtepu8_epi64 (__m128i a)
__m128i _mm_cvtpd_epi32 (__m128d a)
__m64 _mm_cvtpd_pi32 (__m128d a)
__m128 _mm_cvtpd_ps (__m128d a)
__m128 _mm_cvtpi16_ps (__m64 a)
__m128d _mm_cvtpi32_pd (__m64 a)
__m128 _mm_cvtpi32_ps (__m128 a, __m64 b)
__m128 _mm_cvtpi32x2_ps (__m64 a, __m64 b)
__m128 _mm_cvtpi8_ps (__m64 a)
__m128i _mm_cvtps_epi32 (__m128 a)
__m128d _mm_cvtps_pd (__m128 a)
__m64 _mm_cvtps_pi16 (__m128 a)
__m64 _mm_cvtps_pi32 (__m128 a)
__m64 _mm_cvtps_pi8 (__m128 a)
__m128 _mm_cvtpu16_ps (__m64 a)
__m128 _mm_cvtpu8_ps (__m64 a)
double _mm_cvtsd_f64 (__m128d a)
int _mm_cvtsd_si32 (__m128d a)
__int64 _mm_cvtsd_si64 (__m128d a)
__int64 _mm_cvtsd_si64x (__m128d a)
__m128 _mm_cvtsd_ss (__m128 a, __m128d b)
int _mm_cvtsi128_si32 (__m128i a)
__int64 _mm_cvtsi128_si64 (__m128i a)
__int64 _mm_cvtsi128_si64x (__m128i a)
__m128d _mm_cvtsi32_sd (__m128d a, int b)
__m128i _mm_cvtsi32_si128 (int a)
__m128 _mm_cvtsi32_ss (__m128 a, int b)
__m128d _mm_cvtsi64_sd (__m128d a, __int64 b)
__m128i _mm_cvtsi64_si128 (__int64 a)
__m128 _mm_cvtsi64_ss (__m128 a, __int64 b)
__m128d _mm_cvtsi64x_sd (__m128d a, __int64 b)
__m128i _mm_cvtsi64x_si128 (__int64 a)
float _mm_cvtss_f32 (__m128 a)
__m128d _mm_cvtss_sd (__m128d a, __m128 b)
int _mm_cvtss_si32 (__m128 a)
__int64 _mm_cvtss_si64 (__m128 a)
__m64 _mm_cvtt_ps2pi (__m128 a)
int _mm_cvtt_ss2si (__m128 a)
__m128i _mm_cvttpd_epi32 (__m128d a)
__m64 _mm_cvttpd_pi32 (__m128d a)
__m128i _mm_cvttps_epi32 (__m128 a)
__m64 _mm_cvttps_pi32 (__m128 a)
int _mm_cvttsd_si32 (__m128d a)
__int64 _mm_cvttsd_si64 (__m128d a)
__int64 _mm_cvttsd_si64x (__m128d a)
int _mm_cvttss_si32 (__m128 a)
__int64 _mm_cvttss_si64 (__m128 a)
__m128i _mm_packus_epi32 (__m128i a, __m128i b)
Logical逻辑
__m128d _mm_and_pd (__m128d a, __m128d b)
__m128 _mm_and_ps (__m128 a, __m128 b)
__m128i _mm_and_si128 (__m128i a, __m128i b)
__m128d _mm_andnot_pd (__m128d a, __m128d b)
__m128 _mm_andnot_ps (__m128 a, __m128 b)
__m128i _mm_andnot_si128 (__m128i a, __m128i b)
__m128d _mm_or_pd (__m128d a, __m128d b)
__m128 _mm_or_ps (__m128 a, __m128 b)
__m128i _mm_or_si128 (__m128i a, __m128i b)
int _mm_test_all_ones (__m128i a)
int _mm_test_all_zeros (__m128i mask, __m128i a)
int _mm_test_mix_ones_zeros (__m128i mask, __m128i a)
int _mm_testc_si128 (__m128i a, __m128i b)
int _mm_testnzc_si128 (__m128i a, __m128i b)
int _mm_testz_si128 (__m128i a, __m128i b)
__m128d _mm_xor_pd (__m128d a, __m128d b)
__m128 _mm_xor_ps (__m128 a, __m128 b)
__m128i _mm_xor_si128 (__m128i a, __m128i b)
Set设置
__m128i _mm_set_epi16 (short e7, short e6, short e5, short e4, short e3, short e2, short e1, short e0)
__m128i _mm_set_epi32 (int e3, int e2, int e1, int e0)
__m128i _mm_set_epi64 (__m64 e1, __m64 e0)
__m128i _mm_set_epi64x (__int64 e1, __int64 e0)
__m128i _mm_set_epi8 (char e15, char e14, char e13, char e12, char e11, char e10, char e9, char e8, char e7, char e6, char e5, char e4, char e3, char e2, char e1, char e0)
__m128d _mm_set_pd (double e1, double e0)
__m128d _mm_set_pd1 (double a)
__m128 _mm_set_ps (float e3, float e2, float e1, float e0)
__m128 _mm_set_ps1 (float a)
__m128d _mm_set_sd (double a)
__m128 _mm_set_ss (float a)
__m128i _mm_set1_epi16 (short a)
__m128i _mm_set1_epi32 (int a)
__m128i _mm_set1_epi64 (__m64 a)
__m128i _mm_set1_epi64x (__int64 a)
__m128i _mm_set1_epi8 (char a)
__m128d _mm_set1_pd (double a)
__m128 _mm_set1_ps (float a)
__m128i _mm_setr_epi16 (short e7, short e6, short e5, short e4, short e3, short e2, short e1, short e0)
__m128i _mm_setr_epi32 (int e3, int e2, int e1, int e0)
__m128i _mm_setr_epi64 (__m64 e1, __m64 e0)
__m128i _mm_setr_epi8 (char e15, char e14, char e13, char e12, char e11, char e10, char e9, char e8, char e7, char e6, char e5, char e4, char e3, char e2, char e1, char e0)
__m128d _mm_setr_pd (double e1, double e0)
__m128 _mm_setr_ps (float e3, float e2, float e1, float e0)
__m128d _mm_setzero_pd (void)
__m128 _mm_setzero_ps (void)
__m128i _mm_setzero_si128 (void)
3、Store存储
void _mm_maskmove_si64 (__m64 a, __m64 mask, char* mem_addr)
void _mm_maskmoveu_si128 (__m128i a, __m128i mask, char* mem_addr)
void _m_maskmovq (__m64 a, __m64 mask, char* mem_addr)
void _mm_store_pd (double* mem_addr, __m128d a)
void _mm_store_pd1 (double* mem_addr, __m128d a)
void _mm_store_ps (float* mem_addr, __m128 a)
void _mm_store_ps1 (float* mem_addr, __m128 a)
void _mm_store_sd (double* mem_addr, __m128d a)
void _mm_store_si128 (__m128i* mem_addr, __m128i a)
void _mm_store_ss (float* mem_addr, __m128 a)
void _mm_store1_pd (double* mem_addr, __m128d a)
void _mm_store1_ps (float* mem_addr, __m128 a)
void _mm_storeh_pd (double* mem_addr, __m128d a)
void _mm_storeh_pi (__m64* mem_addr, __m128 a)
void _mm_storel_epi64 (__m128i* mem_addr, __m128i a)
void _mm_storel_pd (double* mem_addr, __m128d a)
void _mm_storel_pi (__m64* mem_addr, __m128 a)
void _mm_storer_pd (double* mem_addr, __m128d a)
void _mm_storer_ps (float* mem_addr, __m128 a)
void _mm_storeu_pd (double* mem_addr, __m128d a)
void _mm_storeu_ps (float* mem_addr, __m128 a)
void _mm_storeu_si128 (__m128i* mem_addr, __m128i a)
void _mm_storeu_si16 (void* mem_addr, __m128i a)
void _mm_storeu_si32 (void* mem_addr, __m128i a)
void _mm_storeu_si64 (void* mem_addr, __m128i a)
void _mm_stream_pd (double* mem_addr, __m128d a)
void _mm_stream_pi (__m64* mem_addr, __m64 a)
void _mm_stream_ps (float* mem_addr, __m128 a)
void _mm_stream_si128 (__m128i* mem_addr, __m128i a)
void _mm_stream_si32 (int* mem_addr, int a)
void _mm_stream_si64 (__int64* mem_addr, __int64 a)
参考
1、https://www.zhihu.com/column/c_1550937293912748032
2、https://zhuanlan.zhihu.com/p/409973153
3、https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#ig_expand=4880,3865,6557&techs=SSE_ALL
相关文章:
<C++> SSE指令集
SSE指令集 include库 #include <mmintrin.h> //MMX #include <xmmintrin.h> //SSE(include mmintrin.h) #include <emmintrin.h> //SSE2(include xmmintrin.h) #include <pmmintrin.h> //SSE3(include emmintrin.h) #include <tmmintrin.h> /…...
cortex-A7核LED灯实验--STM32MP157
实验目的:实现LED1 / LED2 / LED3三盏灯工作 一,分析电路图 1,思路 分析电路图可知: 网络编号 引脚编号 LED1 PE10 LED2 > PF10 LED3 > PE8 2,工作原理: 写1:LED灯亮…...
WPF实战项目十三(API篇):备忘录功能api接口、优化待办事项api接口
1、新建MenoDto.cs /// <summary>/// 备忘录传输实体/// </summary>public class MenoDto : BaseDto{private string title;/// <summary>/// 标题/// </summary>public string Title{get { return title; }set { title value; }}private string con…...
clickhouse(十四、分布式DDL阻塞及同步阻塞问题)
文章目录 一、分布式ddl 阻塞、超时现象验证方法解决方案 二、副本同步阻塞现象验证解决方案 一、分布式ddl 阻塞、超时 现象 在clickhouse 集群的操作中,如果同时执行一些重量级变更语句,往往会引起阻塞。 一般是由于节点堆积过多耗时的ddl。然后抛出…...
怎么入门网络安全(黑客)?
目录: 一、自学网络安全学习的误区和陷阱 1.不要试图先成为一名程序员(以编程为基础的学习)再开始学习2.不要把深度学习作为入门第一课3.以黑客技能、兴趣为方向的自学误区:4.不要收集过多的资料二、学习网络安全的一些前期准备三…...
c++ boost::json
Boost社区12月11日发布了1.75版本,在之前,Boost使用Boost.PropertyTree解析JSON,XML,INI和INFO格式的文件。但是由于成文较早及需要兼容其他的数据格式,相比较于其他的…...
《Flink学习笔记》——第九章 多流转换
无论是基本的简单转换和聚合,还是基于窗口的计算,我们都是针对一条流上的数据进行处理的。而在实际应用中,可能需要将不同来源的数据连接合并在一起处理,也有可能需要将一条流拆分开,所以经常会有对多条流进行处理的场…...
openmmlab出现KeyError: ‘xxx is not in the model registry....‘
问题描述 在复现基于mmpose框架的算法时,运行程序出现KeyError: xxx is not in the model registry....的问题,报错原因是自定义的backbone等结构或者某些当前代码使用的方法没有注册到现有的包中, 导致在import的时候无法导入该方法。 解决方案 找到…...
错误代码0x80131500要怎么解决?快速修复方法
错误代码0x80131500通常与.NET Framework 相关的问题有关。它可能表示.NET Framework的安装损坏、版本冲突或系统文件缺失等。下面我们一起来探讨一下解决错误代码0x80131500有哪些。 以下是一些解决方法 安装最新的.NET Framework版本:访问Microsoft官方网站&…...
PMO(Project Management Office)
PMO 是项目管理办公室(Project Management Office)的缩写。它是组织内的一个部门或团队,负责支持和促进项目管理活动,以确保项目按时、按预算、按要求完成。 PMO 的职责和角色可以因组织的性质和需求而有所不同,但通常…...
STM32 CUBEMX CAN通信数据发送失败原因分析
CAN通信是一种数据通信协议,用于在不同设备之间进行通信。它是一种高效的、实时的、可靠的、多主机的、串行通信系统,通常用于汽车电子、工业自动化等领域。CAN通信协议是由德国BOSCH公司于1986年引入,并在欧洲和日本广泛使用。CAN通信具有独…...
长安链并行调度机制(2):DAG构建和从节点执行流程
长安链采用高效的并行调度方式执行交易,了解长安链交易调度、冲突检测和DAG构建流程有助于开发者更好地理解长安链并行调度的运行机制,帮助开发者编写高质量、低冲突的智能合约,更好地构建区块链应用。 上一篇内容我们说明了长安链交易调度、…...
leetcode做题笔记110. 平衡二叉树
给定一个二叉树,判断它是否是高度平衡的二叉树。 本题中,一棵高度平衡二叉树定义为: 一个二叉树每个节点 的左右两个子树的高度差的绝对值不超过 1 。 思路一:递归 int height(struct TreeNode* root) {if (root NULL) {return…...
iOS开发Swift-字符串与字符
1.字符串的定义 let someString "some string value"2.多行字符串的定义(""") let quotation """ 有一个人前来买瓜。 "这瓜甜吗?"他问。 """前一个"""前和后一个""&…...
Linux Kernel:syscall之fork与exec
环境: Kernel Version:Linux-5.10 ARCH:ARM64 一:前言 上一节我们提到了进程的产生方式fork,exec与clone,本节将详细分析fork和exec族系统调用的具体实现。通常这些调用不是由应用程序直接发出的,而是通过一个中间层调用,即负责与内核通信的C标准库。从用户状态切换到…...
CentOS 修改MySQL密码
CentOS 修改MySQL密码 1.登录MySQL 2.执行如下命令 update user set passwordpassword(mivbAs7Awc) where userroot;报错如下: Unknown column ‘password’ in ‘field list’ 3.执行如下命令 update user set passwordpassword(mivbAs7Awc) where userroot碰到…...
Android通过setaffinity实现绑核
有时候为了降低App算力占用,会把关键的线程绑定到大核中,下面介绍一种绑核的方式 查看绑核 查看pid :/ # ps -A | grep test u0_a15 25178 405 15950272 176544 do_epoll_wait 0 S com.test.jnites查看线程号 top -H -p 25178 25224 u0_…...
stm32的位带操作
在51单片机中,我们可以使用P2^1来对单片机的某一位进行操作,到了stm32,我们通过位带操作,将寄存器的每一位映射到一个32位的地址。如下是我查资料摘录的一些图片。 映射方式 SRAM: AliasAddr 0x22000000 (A-0X20000000)*8*4n*4…...
Java 电子招标采购系统源码:营造全面规范安全的电子招投标环境,促进招投标市场健康可持续发展
营造全面规范安全的电子招投标环境,促进招投标市场健康可持续发展 传统采购模式面临的挑战 一、立项管理 1、招标立项申请 功能点:招标类项目立项申请入口,用户可以保存为草稿,提交。 2、非招标立项申请 功能点:非招标…...
https协议经过SpringMVC重定向之后变成http协议
之前项目的协议还是http,当改为https之后,就出现了这个问题。 服务访问地址:https://wuxinke.demo.com 访问某个页面的地址:https://wuxinke.demo.com/aps/judgeProviderOrCtenant.ht 经SpringMVC重定向之后,地址变…...
大语言模型如何处理长文本?常用文本分割技术详解
为什么需要文本分割? 引言:为什么需要文本分割?一、基础文本分割方法1. 按段落分割(Paragraph Splitting)2. 按句子分割(Sentence Splitting)二、高级文本分割策略3. 重叠分割(Sliding Window)4. 递归分割(Recursive Splitting)三、生产级工具推荐5. 使用LangChain的…...
C++中string流知识详解和示例
一、概览与类体系 C 提供三种基于内存字符串的流,定义在 <sstream> 中: std::istringstream:输入流,从已有字符串中读取并解析。std::ostringstream:输出流,向内部缓冲区写入内容,最终取…...
Redis的发布订阅模式与专业的 MQ(如 Kafka, RabbitMQ)相比,优缺点是什么?适用于哪些场景?
Redis 的发布订阅(Pub/Sub)模式与专业的 MQ(Message Queue)如 Kafka、RabbitMQ 进行比较,核心的权衡点在于:简单与速度 vs. 可靠与功能。 下面我们详细展开对比。 Redis Pub/Sub 的核心特点 它是一个发后…...
【Java学习笔记】BigInteger 和 BigDecimal 类
BigInteger 和 BigDecimal 类 二者共有的常见方法 方法功能add加subtract减multiply乘divide除 注意点:传参类型必须是类对象 一、BigInteger 1. 作用:适合保存比较大的整型数 2. 使用说明 创建BigInteger对象 传入字符串 3. 代码示例 import j…...
HarmonyOS运动开发:如何用mpchart绘制运动配速图表
##鸿蒙核心技术##运动开发##Sensor Service Kit(传感器服务)# 前言 在运动类应用中,运动数据的可视化是提升用户体验的重要环节。通过直观的图表展示运动过程中的关键数据,如配速、距离、卡路里消耗等,用户可以更清晰…...
在QWebEngineView上实现鼠标、触摸等事件捕获的解决方案
这个问题我看其他博主也写了,要么要会员、要么写的乱七八糟。这里我整理一下,把问题说清楚并且给出代码,拿去用就行,照着葫芦画瓢。 问题 在继承QWebEngineView后,重写mousePressEvent或event函数无法捕获鼠标按下事…...
Chromium 136 编译指南 Windows篇:depot_tools 配置与源码获取(二)
引言 工欲善其事,必先利其器。在完成了 Visual Studio 2022 和 Windows SDK 的安装后,我们即将接触到 Chromium 开发生态中最核心的工具——depot_tools。这个由 Google 精心打造的工具集,就像是连接开发者与 Chromium 庞大代码库的智能桥梁…...
pikachu靶场通关笔记19 SQL注入02-字符型注入(GET)
目录 一、SQL注入 二、字符型SQL注入 三、字符型注入与数字型注入 四、源码分析 五、渗透实战 1、渗透准备 2、SQL注入探测 (1)输入单引号 (2)万能注入语句 3、获取回显列orderby 4、获取数据库名database 5、获取表名…...
破解路内监管盲区:免布线低位视频桩重塑停车管理新标准
城市路内停车管理常因行道树遮挡、高位设备盲区等问题,导致车牌识别率低、逃费率高,传统模式在复杂路段束手无策。免布线低位视频桩凭借超低视角部署与智能算法,正成为破局关键。该设备安装于车位侧方0.5-0.7米高度,直接规避树枝遮…...
Vue 模板语句的数据来源
🧩 Vue 模板语句的数据来源:全方位解析 Vue 模板(<template> 部分)中的表达式、指令绑定(如 v-bind, v-on)和插值({{ }})都在一个特定的作用域内求值。这个作用域由当前 组件…...
