当前位置: 首页 > news >正文

<C++> SSE指令集

SSE指令集

include库

#include <mmintrin.h>  //MMX
#include <xmmintrin.h> //SSE(include mmintrin.h)
#include <emmintrin.h> //SSE2(include xmmintrin.h)
#include <pmmintrin.h> //SSE3(include emmintrin.h)
#include <tmmintrin.h> //SSSE3(include pmmintrin.h)
#include <smmintrin.h> //SSE4.1(include tmmintrin.h)
#include <nmmintrin.h> //SSE4.2(include smmintrin.h)
#include <wmmintrin.h> //AES(include nmmintrin.h)
#include <immintrin.h> //AVX(include wmmintrin.h)
#include <intrin.h>    //所有版本(include immintrin.h),由MSVC提供;GCC/Clang下对应的头文件是<x86intrin.h>

基本操作

  1. 使用SSE专门的LOAD指令将数据从内存加载一个向量到寄存器;
  2. 使用SSE专门的OP指令对两个向量进行某种计算;
  3. 使用SSE专门的STORE指令把计算结果从寄存器写回到内存。

数据类型

  • __m128表示128bit的向量,包含4个单精度浮点数(float)
typedef union __declspec(intrin_type) __declspec(align(16)) __m128 {float               m128_f32[4];unsigned __int64    m128_u64[2];__int8              m128_i8[16];__int16             m128_i16[8];__int32             m128_i32[4];__int64             m128_i64[2];unsigned __int8     m128_u8[16];unsigned __int16    m128_u16[8];unsigned __int32    m128_u32[4];} __m128;
  • __m128i表示128bit的整数向量,可按8/16/32/64位整数分别解释为16/8/4/2个元素
typedef union __declspec(intrin_type) __declspec(align(16)) __m128i {__int8              m128i_i8[16];__int16             m128i_i16[8];__int32             m128i_i32[4];__int64             m128i_i64[2];unsigned __int8     m128i_u8[16];unsigned __int16    m128i_u16[8];unsigned __int32    m128i_u32[4];unsigned __int64    m128i_u64[2];
} __m128i;
  • __m128d表示128bit的向量,包含2个双精度浮点数(double)
typedef struct __declspec(intrin_type) __declspec(align(16)) __m128d {double              m128d_f64[2];
} __m128d;

指令函数命名

SSE指令的函数从命名上,主要分成三部分,以_mm_loadu_pd为例:

  1. 第一部分均以_mm开头,表示属于SSE指令集;_mm256和_mm512则分别是AVX和AVX-512指令集的Intrinsic函数前缀;
  2. 第二部分表明操作类型,比如load,add,store等。但部分指令后面跟有[l|h|u|r]等字母,比如l/h表示只操作低64位/高64位,u表示mem_addr不需要内存对齐,r表示反向读取等;
  3. 第三部分为操作的对象名及数据类型:
    _ps:packed操作所有的单精度浮点数;
    _pd:packed操作所有的双精度浮点数;
    _pixx:(xx为长度,可以是8,16,32,64)packed操作所有的xx位有符号整数,使用的寄存器长度为64位;
    _epixx:(xx为长度)packed操作所有的xx位的有符号整数,使用的寄存器长度为128位;
    _epuxx: packed操作所有的xx位的无符号整数;
    _ss:scalar操作第一个单精度浮点数;
    _sd:scalar操作第一个双精度浮点数;
    p表示packed即对128bits的数据全部执行相同的操作,s表示scalar,只对128bits中的第一组数据执行操作,如下图所示。
    在这里插入图片描述

1、Load加载

__m128i _mm_lddqu_si128 (__m128i const* mem_addr)
__m128d _mm_load_pd (double const* mem_addr)
__m128d _mm_load_pd1 (double const* mem_addr)
__m128 _mm_load_ps (float const* mem_addr)
__m128 _mm_load_ps1 (float const* mem_addr)
__m128d _mm_load_sd (double const* mem_addr)
__m128i _mm_load_si128 (__m128i const* mem_addr)
__m128 _mm_load_ss (float const* mem_addr)
__m128d _mm_load1_pd (double const* mem_addr)
__m128 _mm_load1_ps (float const* mem_addr)
__m128d _mm_loaddup_pd (double const* mem_addr)
__m128d _mm_loadh_pd (__m128d a, double const* mem_addr)
__m128 _mm_loadh_pi (__m128 a, __m64 const* mem_addr)
__m128i _mm_loadl_epi64 (__m128i const* mem_addr)
__m128d _mm_loadl_pd (__m128d a, double const* mem_addr)
__m128 _mm_loadl_pi (__m128 a, __m64 const* mem_addr)
__m128d _mm_loadr_pd (double const* mem_addr)
__m128 _mm_loadr_ps (float const* mem_addr)
__m128d _mm_loadu_pd (double const* mem_addr)
__m128 _mm_loadu_ps (float const* mem_addr)
__m128i _mm_loadu_si128 (__m128i const* mem_addr)
__m128i _mm_loadu_si16 (void const* mem_addr)
__m128i _mm_loadu_si32 (void const* mem_addr)
__m128i _mm_loadu_si64 (void const* mem_addr)

2、OP操作

Arithmetic算术

__m128i _mm_add_epi16 (__m128i a, __m128i b)
__m128i _mm_add_epi32 (__m128i a, __m128i b)
__m128i _mm_add_epi64 (__m128i a, __m128i b)
__m128i _mm_add_epi8 (__m128i a, __m128i b)
__m128d _mm_add_pd (__m128d a, __m128d b)
__m128 _mm_add_ps (__m128 a, __m128 b)
__m128d _mm_add_sd (__m128d a, __m128d b)
__m64 _mm_add_si64 (__m64 a, __m64 b)
__m128 _mm_add_ss (__m128 a, __m128 b)
__m128i _mm_adds_epi16 (__m128i a, __m128i b)
__m128i _mm_adds_epi8 (__m128i a, __m128i b)
__m128i _mm_adds_epu16 (__m128i a, __m128i b)
__m128i _mm_adds_epu8 (__m128i a, __m128i b)
__m128d _mm_addsub_pd (__m128d a, __m128d b)
__m128 _mm_addsub_ps (__m128 a, __m128 b)
__m128d _mm_div_pd (__m128d a, __m128d b)
__m128 _mm_div_ps (__m128 a, __m128 b)
__m128d _mm_div_sd (__m128d a, __m128d b)
__m128 _mm_div_ss (__m128 a, __m128 b)
__m128d _mm_dp_pd (__m128d a, __m128d b, const int imm8)
__m128 _mm_dp_ps (__m128 a, __m128 b, const int imm8)
__m128i _mm_hadd_epi16 (__m128i a, __m128i b)
__m128i _mm_hadd_epi32 (__m128i a, __m128i b)
__m128d _mm_hadd_pd (__m128d a, __m128d b)
__m64 _mm_hadd_pi16 (__m64 a, __m64 b)
__m64 _mm_hadd_pi32 (__m64 a, __m64 b)
__m128 _mm_hadd_ps (__m128 a, __m128 b)
__m128i _mm_hadds_epi16 (__m128i a, __m128i b)
__m64 _mm_hadds_pi16 (__m64 a, __m64 b)
__m128i _mm_hsub_epi16 (__m128i a, __m128i b)
__m128i _mm_hsub_epi32 (__m128i a, __m128i b)
__m128d _mm_hsub_pd (__m128d a, __m128d b)
__m64 _mm_hsub_pi16 (__m64 a, __m64 b)
__m64 _mm_hsub_pi32 (__m64 a, __m64 b)
__m128 _mm_hsub_ps (__m128 a, __m128 b)
__m128i _mm_hsubs_epi16 (__m128i a, __m128i b)
__m64 _mm_hsubs_pi16 (__m64 a, __m64 b)
__m128i _mm_madd_epi16 (__m128i a, __m128i b)
__m128i _mm_maddubs_epi16 (__m128i a, __m128i b)
__m64 _mm_maddubs_pi16 (__m64 a, __m64 b)
__m128i _mm_mpsadbw_epu8 (__m128i a, __m128i b, const int imm8)
__m128i _mm_mul_epi32 (__m128i a, __m128i b)
__m128i _mm_mul_epu32 (__m128i a, __m128i b)
__m128d _mm_mul_pd (__m128d a, __m128d b)
__m128 _mm_mul_ps (__m128 a, __m128 b)
__m128d _mm_mul_sd (__m128d a, __m128d b)
__m128 _mm_mul_ss (__m128 a, __m128 b)
__m64 _mm_mul_su32 (__m64 a, __m64 b)
__m128i _mm_mulhi_epi16 (__m128i a, __m128i b)
__m128i _mm_mulhi_epu16 (__m128i a, __m128i b)
__m64 _mm_mulhi_pu16 (__m64 a, __m64 b)
__m128i _mm_mulhrs_epi16 (__m128i a, __m128i b)
__m64 _mm_mulhrs_pi16 (__m64 a, __m64 b)
__m128i _mm_mullo_epi16 (__m128i a, __m128i b)
__m128i _mm_mullo_epi32 (__m128i a, __m128i b)
__m64 _m_pmulhuw (__m64 a, __m64 b)
__m64 _m_psadbw (__m64 a, __m64 b)
__m128i _mm_sad_epu8 (__m128i a, __m128i b)
__m64 _mm_sad_pu8 (__m64 a, __m64 b)
__m128i _mm_sign_epi16 (__m128i a, __m128i b)
__m128i _mm_sign_epi32 (__m128i a, __m128i b)
__m128i _mm_sign_epi8 (__m128i a, __m128i b)
__m64 _mm_sign_pi16 (__m64 a, __m64 b)
__m64 _mm_sign_pi32 (__m64 a, __m64 b)
__m64 _mm_sign_pi8 (__m64 a, __m64 b)
__m128i _mm_sub_epi16 (__m128i a, __m128i b)
__m128i _mm_sub_epi32 (__m128i a, __m128i b)
__m128i _mm_sub_epi64 (__m128i a, __m128i b)
__m128i _mm_sub_epi8 (__m128i a, __m128i b)
__m128d _mm_sub_pd (__m128d a, __m128d b)
__m128 _mm_sub_ps (__m128 a, __m128 b)
__m128d _mm_sub_sd (__m128d a, __m128d b)
__m64 _mm_sub_si64 (__m64 a, __m64 b)
__m128 _mm_sub_ss (__m128 a, __m128 b)
__m128i _mm_subs_epi16 (__m128i a, __m128i b)
__m128i _mm_subs_epi8 (__m128i a, __m128i b)
__m128i _mm_subs_epu16 (__m128i a, __m128i b)
__m128i _mm_subs_epu8 (__m128i a, __m128i b)

Compare比较

__m128i _mm_cmpeq_epi16 (__m128i a, __m128i b)
__m128i _mm_cmpeq_epi32 (__m128i a, __m128i b)
__m128i _mm_cmpeq_epi64 (__m128i a, __m128i b)
__m128i _mm_cmpeq_epi8 (__m128i a, __m128i b)
__m128d _mm_cmpeq_pd (__m128d a, __m128d b)
__m128 _mm_cmpeq_ps (__m128 a, __m128 b)
__m128d _mm_cmpeq_sd (__m128d a, __m128d b)
__m128 _mm_cmpeq_ss (__m128 a, __m128 b)
__m128d _mm_cmpge_pd (__m128d a, __m128d b)
__m128 _mm_cmpge_ps (__m128 a, __m128 b)
__m128d _mm_cmpge_sd (__m128d a, __m128d b)
__m128 _mm_cmpge_ss (__m128 a, __m128 b)
__m128i _mm_cmpgt_epi16 (__m128i a, __m128i b)
__m128i _mm_cmpgt_epi32 (__m128i a, __m128i b)
__m128i _mm_cmpgt_epi64 (__m128i a, __m128i b)
__m128i _mm_cmpgt_epi8 (__m128i a, __m128i b)
__m128d _mm_cmpgt_pd (__m128d a, __m128d b)
__m128 _mm_cmpgt_ps (__m128 a, __m128 b)
__m128d _mm_cmpgt_sd (__m128d a, __m128d b)
__m128 _mm_cmpgt_ss (__m128 a, __m128 b)
__m128d _mm_cmple_pd (__m128d a, __m128d b)
__m128 _mm_cmple_ps (__m128 a, __m128 b)
__m128d _mm_cmple_sd (__m128d a, __m128d b)
__m128 _mm_cmple_ss (__m128 a, __m128 b)
__m128i _mm_cmplt_epi16 (__m128i a, __m128i b)
__m128i _mm_cmplt_epi32 (__m128i a, __m128i b)
__m128i _mm_cmplt_epi8 (__m128i a, __m128i b)
__m128d _mm_cmplt_pd (__m128d a, __m128d b)
__m128 _mm_cmplt_ps (__m128 a, __m128 b)
__m128d _mm_cmplt_sd (__m128d a, __m128d b)
__m128 _mm_cmplt_ss (__m128 a, __m128 b)
__m128d _mm_cmpneq_pd (__m128d a, __m128d b)
__m128 _mm_cmpneq_ps (__m128 a, __m128 b)
__m128d _mm_cmpneq_sd (__m128d a, __m128d b)
__m128 _mm_cmpneq_ss (__m128 a, __m128 b)
__m128d _mm_cmpnge_pd (__m128d a, __m128d b)
__m128 _mm_cmpnge_ps (__m128 a, __m128 b)
__m128d _mm_cmpnge_sd (__m128d a, __m128d b)
__m128 _mm_cmpnge_ss (__m128 a, __m128 b)
__m128d _mm_cmpngt_pd (__m128d a, __m128d b)
__m128 _mm_cmpngt_ps (__m128 a, __m128 b)
__m128d _mm_cmpngt_sd (__m128d a, __m128d b)
__m128 _mm_cmpngt_ss (__m128 a, __m128 b)
__m128d _mm_cmpnle_pd (__m128d a, __m128d b)
__m128 _mm_cmpnle_ps (__m128 a, __m128 b)
__m128d _mm_cmpnle_sd (__m128d a, __m128d b)
__m128 _mm_cmpnle_ss (__m128 a, __m128 b)
__m128d _mm_cmpnlt_pd (__m128d a, __m128d b)
__m128 _mm_cmpnlt_ps (__m128 a, __m128 b)
__m128d _mm_cmpnlt_sd (__m128d a, __m128d b)
__m128 _mm_cmpnlt_ss (__m128 a, __m128 b)
__m128d _mm_cmpord_pd (__m128d a, __m128d b)
__m128 _mm_cmpord_ps (__m128 a, __m128 b)
__m128d _mm_cmpord_sd (__m128d a, __m128d b)
__m128 _mm_cmpord_ss (__m128 a, __m128 b)
__m128d _mm_cmpunord_pd (__m128d a, __m128d b)
__m128 _mm_cmpunord_ps (__m128 a, __m128 b)
__m128d _mm_cmpunord_sd (__m128d a, __m128d b)
__m128 _mm_cmpunord_ss (__m128 a, __m128 b)
int _mm_comieq_sd (__m128d a, __m128d b)
int _mm_comieq_ss (__m128 a, __m128 b)
int _mm_comige_sd (__m128d a, __m128d b)
int _mm_comige_ss (__m128 a, __m128 b)
int _mm_comigt_sd (__m128d a, __m128d b)
int _mm_comigt_ss (__m128 a, __m128 b)
int _mm_comile_sd (__m128d a, __m128d b)
int _mm_comile_ss (__m128 a, __m128 b)
int _mm_comilt_sd (__m128d a, __m128d b)
int _mm_comilt_ss (__m128 a, __m128 b)
int _mm_comineq_sd (__m128d a, __m128d b)
int _mm_comineq_ss (__m128 a, __m128 b)
int _mm_ucomieq_sd (__m128d a, __m128d b)
int _mm_ucomieq_ss (__m128 a, __m128 b)
int _mm_ucomige_sd (__m128d a, __m128d b)
int _mm_ucomige_ss (__m128 a, __m128 b)
int _mm_ucomigt_sd (__m128d a, __m128d b)
int _mm_ucomigt_ss (__m128 a, __m128 b)
int _mm_ucomile_sd (__m128d a, __m128d b)
int _mm_ucomile_ss (__m128 a, __m128 b)
int _mm_ucomilt_sd (__m128d a, __m128d b)
int _mm_ucomilt_ss (__m128 a, __m128 b)
int _mm_ucomineq_sd (__m128d a, __m128d b)
int _mm_ucomineq_ss (__m128 a, __m128 b)

Convert转换

__m128 _mm_cvt_pi2ps (__m128 a, __m64 b)
__m64 _mm_cvt_ps2pi (__m128 a)
__m128 _mm_cvt_si2ss (__m128 a, int b)
int _mm_cvt_ss2si (__m128 a)
__m128i _mm_cvtepi16_epi32 (__m128i a)
__m128i _mm_cvtepi16_epi64 (__m128i a)
__m128i _mm_cvtepi32_epi64 (__m128i a)
__m128d _mm_cvtepi32_pd (__m128i a)
__m128 _mm_cvtepi32_ps (__m128i a)
__m128i _mm_cvtepi8_epi16 (__m128i a)
__m128i _mm_cvtepi8_epi32 (__m128i a)
__m128i _mm_cvtepi8_epi64 (__m128i a)
__m128i _mm_cvtepu16_epi32 (__m128i a)
__m128i _mm_cvtepu16_epi64 (__m128i a)
__m128i _mm_cvtepu32_epi64 (__m128i a)
__m128i _mm_cvtepu8_epi16 (__m128i a)
__m128i _mm_cvtepu8_epi32 (__m128i a)
__m128i _mm_cvtepu8_epi64 (__m128i a)
__m128i _mm_cvtpd_epi32 (__m128d a)
__m64 _mm_cvtpd_pi32 (__m128d a)
__m128 _mm_cvtpd_ps (__m128d a)
__m128 _mm_cvtpi16_ps (__m64 a)
__m128d _mm_cvtpi32_pd (__m64 a)
__m128 _mm_cvtpi32_ps (__m128 a, __m64 b)
__m128 _mm_cvtpi32x2_ps (__m64 a, __m64 b)
__m128 _mm_cvtpi8_ps (__m64 a)
__m128i _mm_cvtps_epi32 (__m128 a)
__m128d _mm_cvtps_pd (__m128 a)
__m64 _mm_cvtps_pi16 (__m128 a)
__m64 _mm_cvtps_pi32 (__m128 a)
__m64 _mm_cvtps_pi8 (__m128 a)
__m128 _mm_cvtpu16_ps (__m64 a)
__m128 _mm_cvtpu8_ps (__m64 a)
double _mm_cvtsd_f64 (__m128d a)
int _mm_cvtsd_si32 (__m128d a)
__int64 _mm_cvtsd_si64 (__m128d a)
__int64 _mm_cvtsd_si64x (__m128d a)
__m128 _mm_cvtsd_ss (__m128 a, __m128d b)
int _mm_cvtsi128_si32 (__m128i a)
__int64 _mm_cvtsi128_si64 (__m128i a)
__int64 _mm_cvtsi128_si64x (__m128i a)
__m128d _mm_cvtsi32_sd (__m128d a, int b)
__m128i _mm_cvtsi32_si128 (int a)
__m128 _mm_cvtsi32_ss (__m128 a, int b)
__m128d _mm_cvtsi64_sd (__m128d a, __int64 b)
__m128i _mm_cvtsi64_si128 (__int64 a)
__m128 _mm_cvtsi64_ss (__m128 a, __int64 b)
__m128d _mm_cvtsi64x_sd (__m128d a, __int64 b)
__m128i _mm_cvtsi64x_si128 (__int64 a)
float _mm_cvtss_f32 (__m128 a)
__m128d _mm_cvtss_sd (__m128d a, __m128 b)
int _mm_cvtss_si32 (__m128 a)
__int64 _mm_cvtss_si64 (__m128 a)
__m64 _mm_cvtt_ps2pi (__m128 a)
int _mm_cvtt_ss2si (__m128 a)
__m128i _mm_cvttpd_epi32 (__m128d a)
__m64 _mm_cvttpd_pi32 (__m128d a)
__m128i _mm_cvttps_epi32 (__m128 a)
__m64 _mm_cvttps_pi32 (__m128 a)
int _mm_cvttsd_si32 (__m128d a)
__int64 _mm_cvttsd_si64 (__m128d a)
__int64 _mm_cvttsd_si64x (__m128d a)
int _mm_cvttss_si32 (__m128 a)
__int64 _mm_cvttss_si64 (__m128 a)
__m128i _mm_packus_epi32 (__m128i a, __m128i b)

Logical逻辑

__m128d _mm_and_pd (__m128d a, __m128d b)
__m128 _mm_and_ps (__m128 a, __m128 b)
__m128i _mm_and_si128 (__m128i a, __m128i b)
__m128d _mm_andnot_pd (__m128d a, __m128d b)
__m128 _mm_andnot_ps (__m128 a, __m128 b)
__m128i _mm_andnot_si128 (__m128i a, __m128i b)
__m128d _mm_or_pd (__m128d a, __m128d b)
__m128 _mm_or_ps (__m128 a, __m128 b)
__m128i _mm_or_si128 (__m128i a, __m128i b)
int _mm_test_all_ones (__m128i a)
int _mm_test_all_zeros (__m128i mask, __m128i a)
int _mm_test_mix_ones_zeros (__m128i mask, __m128i a)
int _mm_testc_si128 (__m128i a, __m128i b)
int _mm_testnzc_si128 (__m128i a, __m128i b)
int _mm_testz_si128 (__m128i a, __m128i b)
__m128d _mm_xor_pd (__m128d a, __m128d b)
__m128 _mm_xor_ps (__m128 a, __m128 b)
__m128i _mm_xor_si128 (__m128i a, __m128i b)

Set设置

__m128i _mm_set_epi16 (short e7, short e6, short e5, short e4, short e3, short e2, short e1, short e0)
__m128i _mm_set_epi32 (int e3, int e2, int e1, int e0)
__m128i _mm_set_epi64 (__m64 e1, __m64 e0)
__m128i _mm_set_epi64x (__int64 e1, __int64 e0)
__m128i _mm_set_epi8 (char e15, char e14, char e13, char e12, char e11, char e10, char e9, char e8, char e7, char e6, char e5, char e4, char e3, char e2, char e1, char e0)
__m128d _mm_set_pd (double e1, double e0)
__m128d _mm_set_pd1 (double a)
__m128 _mm_set_ps (float e3, float e2, float e1, float e0)
__m128 _mm_set_ps1 (float a)
__m128d _mm_set_sd (double a)
__m128 _mm_set_ss (float a)
__m128i _mm_set1_epi16 (short a)
__m128i _mm_set1_epi32 (int a)
__m128i _mm_set1_epi64 (__m64 a)
__m128i _mm_set1_epi64x (__int64 a)
__m128i _mm_set1_epi8 (char a)
__m128d _mm_set1_pd (double a)
__m128 _mm_set1_ps (float a)
__m128i _mm_setr_epi16 (short e7, short e6, short e5, short e4, short e3, short e2, short e1, short e0)
__m128i _mm_setr_epi32 (int e3, int e2, int e1, int e0)
__m128i _mm_setr_epi64 (__m64 e1, __m64 e0)
__m128i _mm_setr_epi8 (char e15, char e14, char e13, char e12, char e11, char e10, char e9, char e8, char e7, char e6, char e5, char e4, char e3, char e2, char e1, char e0)
__m128d _mm_setr_pd (double e1, double e0)
__m128 _mm_setr_ps (float e3, float e2, float e1, float e0)
__m128d _mm_setzero_pd (void)
__m128 _mm_setzero_ps (void)
__m128i _mm_setzero_si128 ()

3、Store存储

void _mm_maskmove_si64 (__m64 a, __m64 mask, char* mem_addr)
void _mm_maskmoveu_si128 (__m128i a, __m128i mask, char* mem_addr)
void _m_maskmovq (__m64 a, __m64 mask, char* mem_addr)
void _mm_store_pd (double* mem_addr, __m128d a)
void _mm_store_pd1 (double* mem_addr, __m128d a)
void _mm_store_ps (float* mem_addr, __m128 a)
void _mm_store_ps1 (float* mem_addr, __m128 a)
void _mm_store_sd (double* mem_addr, __m128d a)
void _mm_store_si128 (__m128i* mem_addr, __m128i a)
void _mm_store_ss (float* mem_addr, __m128 a)
void _mm_store1_pd (double* mem_addr, __m128d a)
void _mm_store1_ps (float* mem_addr, __m128 a)
void _mm_storeh_pd (double* mem_addr, __m128d a)
void _mm_storeh_pi (__m64* mem_addr, __m128 a)
void _mm_storel_epi64 (__m128i* mem_addr, __m128i a)
void _mm_storel_pd (double* mem_addr, __m128d a)
void _mm_storel_pi (__m64* mem_addr, __m128 a)
void _mm_storer_pd (double* mem_addr, __m128d a)
void _mm_storer_ps (float* mem_addr, __m128 a)
void _mm_storeu_pd (double* mem_addr, __m128d a)
void _mm_storeu_ps (float* mem_addr, __m128 a)
void _mm_storeu_si128 (__m128i* mem_addr, __m128i a)
void _mm_storeu_si16 (void* mem_addr, __m128i a)
void _mm_storeu_si32 (void* mem_addr, __m128i a)
void _mm_storeu_si64 (void* mem_addr, __m128i a)
void _mm_stream_pd (double* mem_addr, __m128d a)
void _mm_stream_pi (__m64* mem_addr, __m64 a)
void _mm_stream_ps (float* mem_addr, __m128 a)
void _mm_stream_si128 (__m128i* mem_addr, __m128i a)
void _mm_stream_si32 (int* mem_addr, int a)
void _mm_stream_si64 (__int64* mem_addr, __int64 a)

参考

1、https://www.zhihu.com/column/c_1550937293912748032
2、https://zhuanlan.zhihu.com/p/409973153
3、https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#ig_expand=4880,3865,6557&techs=SSE_ALL

相关文章:

<C++> SSE指令集

SSE指令集 include库 #include <mmintrin.h> //MMX #include <xmmintrin.h> //SSE(include mmintrin.h) #include <emmintrin.h> //SSE2(include xmmintrin.h) #include <pmmintrin.h> //SSE3(include emmintrin.h) #include <tmmintrin.h> /…...

cortex-A7核LED灯实验--STM32MP157

实验目的&#xff1a;实现LED1 / LED2 / LED3三盏灯工作 一&#xff0c;分析电路图 1&#xff0c;思路 分析电路图可知&#xff1a; 网络编号 引脚编号 LED1 PE10 LED2 > PF10 LED3 > PE8 2&#xff0c;工作原理&#xff1a; 写1&#xff1a;LED灯亮&#xf…...

WPF实战项目十三(API篇):备忘录功能api接口、优化待办事项api接口

1、新建MenoDto.cs /// <summary>/// 备忘录传输实体/// </summary>public class MenoDto : BaseDto{private string title;/// <summary>/// 标题/// </summary>public string Title{get { return title; }set { title value; }}private string con…...

clickhouse(十四、分布式DDL阻塞及同步阻塞问题)

文章目录 一、分布式ddl 阻塞、超时现象验证方法解决方案 二、副本同步阻塞现象验证解决方案 一、分布式ddl 阻塞、超时 现象 在clickhouse 集群的操作中，如果同时执行一些重量级变更语句，往往会引起阻塞。 一般是由于节点堆积过多耗时的ddl。然后抛出…...

怎么入门网络安全(黑客)?

目录： 一、自学网络安全学习的误区和陷阱 1.不要试图先成为一名程序员（以编程为基础的学习）再开始学习2.不要把深度学习作为入门第一课3.以黑客技能、兴趣为方向的自学误区：4.不要收集过多的资料二、学习网络安全的一些前期准备三…...

c++ boost::json

Boost社区12月11日发布了1.75版本&#xff0c;在之前&#xff0c;​​Boost使用Boost.PropertyTree解析​​JSON​​​&#xff0c;​​XML​​​&#xff0c;​​INI​​​和​​INFO​​​格式的文件。但是由于成文较早及需要兼容其他的数据格式&#xff0c;相比较于其他的​…...

《Flink学习笔记》——第九章 多流转换

无论是基本的简单转换和聚合，还是基于窗口的计算，我们都是针对一条流上的数据进行处理的。而在实际应用中，可能需要将不同来源的数据连接合并在一起处理，也有可能需要将一条流拆分开，所以经常会有对多条流进行处理的场…...

openmmlab出现KeyError: ‘xxx is not in the model registry....‘

问题描述 在复现基于mmpose框架的算法时，运行程序出现KeyError: xxx is not in the model registry....的问题，报错原因是自定义的backbone等结构或者某些当前代码使用的方法没有注册到现有的包中, 导致在import的时候无法导入该方法。 解决方案 找到…...

错误代码0x80131500要怎么解决?快速修复方法

错误代码0x80131500通常与.NET Framework 相关的问题有关。它可能表示.NET Framework的安装损坏、版本冲突或系统文件缺失等。下面我们一起来探讨一下解决错误代码0x80131500有哪些。 以下是一些解决方法 安装最新的.NET Framework版本&#xff1a;访问Microsoft官方网站&…...

PMO(Project Management Office)

PMO 是项目管理办公室（Project Management Office）的缩写。它是组织内的一个部门或团队，负责支持和促进项目管理活动，以确保项目按时、按预算、按要求完成。 PMO 的职责和角色可以因组织的性质和需求而有所不同，但通常…...

STM32 CUBEMX CAN通信数据发送失败原因分析

CAN通信是一种数据通信协议，用于在不同设备之间进行通信。它是一种高效的、实时的、可靠的、多主机的、串行通信系统，通常用于汽车电子、工业自动化等领域。CAN通信协议是由德国BOSCH公司于1986年引入，并在欧洲和日本广泛使用。CAN通信具有独…...

长安链并行调度机制(2):DAG构建和从节点执行流程

长安链采用高效的并行调度方式执行交易，了解长安链交易调度、冲突检测和DAG构建流程有助于开发者更好地理解长安链并行调度的运行机制，帮助开发者编写高质量、低冲突的智能合约，更好地构建区块链应用。 上一篇内容我们说明了长安链交易调度、…...

leetcode做题笔记110. 平衡二叉树

给定一个二叉树，判断它是否是高度平衡的二叉树。 本题中，一棵高度平衡二叉树定义为： 一个二叉树每个节点 的左右两个子树的高度差的绝对值不超过 1 。 思路一：递归 int height(struct TreeNode* root) {if (root NULL) {return…...

iOS开发Swift-字符串与字符

1.字符串的定义 let someString "some string value"2.多行字符串的定义(""") let quotation """ 有一个人前来买瓜。 "这瓜甜吗&#xff1f;"他问。 """前一个"""前和后一个""&…...

Linux Kernel:syscall之fork与exec

环境: Kernel Version:Linux-5.10 ARCH:ARM64 一:前言 上一节我们提到了进程的产生方式fork,exec与clone,本节将详细分析fork和exec族系统调用的具体实现。通常这些调用不是由应用程序直接发出的,而是通过一个中间层调用,即负责与内核通信的C标准库。从用户状态切换到…...

CentOS 修改MySQL密码

CentOS 修改MySQL密码 1.登录MySQL 2.执行如下命令 update user set passwordpassword(mivbAs7Awc) where userroot;报错如下&#xff1a; Unknown column ‘password’ in ‘field list’ 3.执行如下命令 update user set passwordpassword(mivbAs7Awc) where userroot碰到…...

Android通过setaffinity实现绑核

有时候为了降低App算力占用&#xff0c;会把关键的线程绑定到大核中&#xff0c;下面介绍一种绑核的方式 查看绑核 查看pid :/ # ps -A | grep test u0_a15 25178 405 15950272 176544 do_epoll_wait 0 S com.test.jnites查看线程号 top -H -p 25178 25224 u0_…...

stm32的位带操作

在51单片机中，我们可以使用P2^1来对单片机的某一位进行操作，到了stm32，我们通过位带操作，将寄存器的每一位映射到一个32位的地址。如下是我查资料摘录的一些图片。 映射方式 SRAM: AliasAddr 0x22000000 (A-0X20000000)*8*4n*4…...

Java 电子招标采购系统源码:营造全面规范安全的电子招投标环境,促进招投标市场健康可持续发展

营造全面规范安全的电子招投标环境，促进招投标市场健康可持续发展 传统采购模式面临的挑战 一、立项管理 1、招标立项申请 功能点：招标类项目立项申请入口，用户可以保存为草稿，提交。 2、非招标立项申请 功能点：非招标…...

https协议经过SpringMVC重定向之后变成http协议

之前项目的协议还是http，当改为https之后，就出现了这个问题。 服务访问地址：https://wuxinke.demo.com 访问某个页面的地址：https://wuxinke.demo.com/aps/judgeProviderOrCtenant.ht 经SpringMVC重定向之后，地址变…...

iOS 分别对一张图的局部进行磨砂,拼接起来不能贴合

效果图 需求&#xff0c;由于视图层级的原因&#xff0c;需要对图片分开进行磨砂&#xff0c; 然后组合在一起 如图&#xff0c;上下两部分&#xff0c;上下两个UIImageVIew大小相同&#xff0c;都是和图片同样的大小&#xff0c;只是上面的UIimageVIew 只展示上半部份 &#…...

与面试官互动:建立积极的技术讨论氛围

&#x1f337;&#x1f341; 博主猫头虎 带您 Go to New World.✨&#x1f341; &#x1f984; 博客首页——猫头虎的博客&#x1f390; &#x1f433;《面试题大全专栏》 文章图文并茂&#x1f995;生动形象&#x1f996;简单易学&#xff01;欢迎大家来踩踩~&#x1f33a; &a…...

计算机竞赛 基于YOLO实现的口罩佩戴检测 - python opemcv 深度学习

文章目录 0 前言1 课题介绍2 算法原理2.1 算法简介2.2 网络架构 3 关键代码4 数据集4.1 安装4.2 打开4.3 选择yolo标注格式4.4 打标签4.5 保存 5 训练6 实现效果6.1 pyqt实现简单GUI6.3 视频识别效果6.4 摄像头实时识别 7 最后 0 前言 &#x1f525; 优质竞赛项目系列&#xf…...

完美解决Ubuntu网络故障,连接异常,IP地址一直显示127.0.0.1

终端输入ifconfig显示虚拟机IP地址为127.0.0.1&#xff0c;具体输出内容如下&#xff1a; wxyubuntu:~$ ifconfig lo: flags73<UP,LOOPBACK,RUNNING> mtu 65536inet 127.0.0.1 netmask 255.0.0.0inet6 ::1 prefixlen 128 scopeid 0x10<host>loop txqueuelen …...

手机无人直播软件有哪些,又有哪些优势?

如今&#xff0c;随着智能手机的普及和移动互联网的发展&#xff0c;手机无人直播成为了一个炙手可热的领域。手机无人直播软件为用户提供了便捷、灵活的直播方式&#xff0c;让更多商家人能够实现自己的直播带货的梦想。接下来&#xff0c;我们将探讨手机无人直播软件有哪些&a…...

解密算法与数据结构面试:程序员如何应对挑战

&#x1f337;&#x1f341; 博主猫头虎 带您 Go to New World.✨&#x1f341; &#x1f984; 博客首页——猫头虎的博客&#x1f390; &#x1f433;《面试题大全专栏》 文章图文并茂&#x1f995;生动形象&#x1f996;简单易学&#xff01;欢迎大家来踩踩~&#x1f33a; &a…...

分布式事务7种(秒懂-2PC、3PC、TCC、Saga、本地事务表、MQ事务消息、最大努力通)

参考文章： 七种常见分布式事务详解（2PC、3PC、TCC、Saga、本地事务表、MQ事务消息、最大努力通知）_张维鹏的博客-CSDN博客 分布式事务 （秒懂）_40岁资深老架构师尼恩的博客-CSDN博客 分布式事务：在分布式…...

基于Java+SpringBoot+Vue前后端分离美食推荐商城设计和实现

博主介绍&#xff1a;✌全网粉丝30W,csdn特邀作者、博客专家、CSDN新星计划导师、Java领域优质创作者,博客之星、掘金/华为云/阿里云/InfoQ等平台优质作者、专注于Java技术领域和毕业项目实战✌ &#x1f345;文末获取源码联系&#x1f345; &#x1f447;&#x1f3fb; 精彩专…...

最新ChatGPT程序源码+AI系统+详细图文搭建教程/支持GPT4/AI绘画/H5端/完整Prompt知识库

一、AI系统 如何搭建部署人工智能源码、AI创作系统、ChatGPT系统呢&#xff1f;小编这里写一个详细图文教程吧&#xff01;SparkAi使用Nestjs和Vue3框架技术&#xff0c;持续集成AI能力到AIGC系统&#xff01; 1.1 程序核心功能 程序已支持ChatGPT3.5/GPT-4提问、AI绘画、Mi…...

本地启动若依微服务版本

前置工作： 1.导入sql文件 2.安装完nacos 3.安装完redis 启动步骤： 1.开启nacos，在bin目录下 startup.cmd -m standalone 注意：在这之前要配置nacos持久化，修改conf/application.properties文件，增加支持…...