@@ -359,6 +359,45 @@ static const size_t CACHE_LINE_SIZE_F32 = CACHE_LINE_SIZE/sizeof(float);

#define QK 32

// AVX routines provided by GH user Const-me
// ref: https://github.com/ggerganov/ggml/pull/27#issuecomment-1464934600
#if __AVX2__
// Unpack 32 4-bit fields into 32 bytes
// The output vector contains 32 bytes, each one in [ 0 .. 15 ] interval
inline __m256i bytesFromNibbles( const uint8_t* rsi )
{
    // Load 16 bytes from memory
    __m128i tmp = _mm_loadu_si128( ( const __m128i* )rsi );

    // Expand bytes into uint16_t values
    __m256i bytes = _mm256_cvtepu8_epi16( tmp );

    // Unpack values into individual bytes
    const __m256i lowMask = _mm256_set1_epi8( 0xF );
    __m256i high = _mm256_andnot_si256( lowMask, bytes );
    __m256i low = _mm256_and_si256( lowMask, bytes );
    high = _mm256_slli_epi16( high, 4 );
    bytes = _mm256_or_si256( low, high );
    return bytes;
}
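
// For reference, the scalar equivalent of the function above (a hypothetical
// bytesFromNibbles_ref, shown only to document the byte order): source byte j
// expands into destination bytes 2*j (low nibble) and 2*j+1 (high nibble).
//
//     void bytesFromNibbles_ref( const uint8_t * src, uint8_t * dst )
//     {
//         for ( int j = 0; j < 16; ++j )
//         {
//             dst[2*j + 0] = src[j] & 0xF;
//             dst[2*j + 1] = src[j] >> 4;
//         }
//     }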

inline __m128i packNibbles( __m256i bytes )
{
    // Move bits within 16-bit lanes from 0000_abcd_0000_efgh into 0000_0000_abcd_efgh
    const __m256i lowByte = _mm256_set1_epi16( 0xFF );
    __m256i high = _mm256_andnot_si256( lowByte, bytes );
    __m256i low = _mm256_and_si256( lowByte, bytes );
    high = _mm256_srli_epi16( high, 4 );
    bytes = _mm256_or_si256( low, high );

    // Compress uint16_t lanes into bytes
    __m128i r0 = _mm256_castsi256_si128( bytes );
    __m128i r1 = _mm256_extracti128_si256( bytes, 1 );
    return _mm_packus_epi16( r0, r1 );
}
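
// Likewise, a scalar sketch of packNibbles (hypothetical packNibbles_ref),
// the inverse of bytesFromNibbles: byte pairs are folded back into nibbles.
//
//     void packNibbles_ref( const uint8_t * src, uint8_t * dst )
//     {
//         for ( int j = 0; j < 16; ++j )
//             dst[j] = (uint8_t)( src[2*j] | ( src[2*j + 1] << 4 ) );
//     }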
#endif

// method 5
// blocks of QK elements
// represented with a single float (delta) and QK/2 8-bit ints (i.e. QK 4-bit signed integer factors)
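//
// assuming the pd/pb pointer arithmetic below (pd at the delta, pb at the
// packed data, both advanced by bs per block), each block occupies
// bs = sizeof(float) + QK/2 = 20 bytes:
//
//     | d (float, 4 bytes) | 16 bytes: QK = 32 values, 2 nibbles per byte |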

@@ -414,6 +453,77 @@ void quantize_row_q4_0(const float * restrict x, void * restrict y, int k) {
#else
#error "not implemented for QK"
#endif
#elif defined(__AVX2__)
#if QK == 32
    for (int i = 0; i < nb; i++) {
        // Load elements into 4 AVX vectors
        __m256 v0 = _mm256_loadu_ps( x );
        __m256 v1 = _mm256_loadu_ps( x + 8 );
        __m256 v2 = _mm256_loadu_ps( x + 16 );
        __m256 v3 = _mm256_loadu_ps( x + 24 );
        x += 32;

        // Compute max(abs(e)) for the block
        const __m256 signBit = _mm256_set1_ps( -0.0f );
        __m256 maxAbs = _mm256_andnot_ps( signBit, v0 );
        maxAbs = _mm256_max_ps( maxAbs, _mm256_andnot_ps( signBit, v1 ) );
        maxAbs = _mm256_max_ps( maxAbs, _mm256_andnot_ps( signBit, v2 ) );
        maxAbs = _mm256_max_ps( maxAbs, _mm256_andnot_ps( signBit, v3 ) );

        __m128 max4 = _mm_max_ps( _mm256_extractf128_ps( maxAbs, 1 ), _mm256_castps256_ps128( maxAbs ) );
        max4 = _mm_max_ps( max4, _mm_movehl_ps( max4, max4 ) );
        max4 = _mm_max_ss( max4, _mm_movehdup_ps( max4 ) );
        const float maxScalar = _mm_cvtss_f32( max4 );
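
        // Note: andnot with -0.0f clears only the sign bit, i.e. a vectorized
        // fabsf; the three max steps then halve the candidate count each time
        // (8 floats -> 4 -> 2 -> 1), leaving the block maximum in lane 0.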

        // Quantize these floats
        const float d = maxScalar / 7.0f;
        *(float *)pd = d;
        pd += bs;
        const float id = ( maxScalar != 0.0f ) ? 7.0f / maxScalar : 0.0f;
        const __m256 mul = _mm256_set1_ps( id );

        // Apply the multiplier
        v0 = _mm256_mul_ps( v0, mul );
        v1 = _mm256_mul_ps( v1, mul );
        v2 = _mm256_mul_ps( v2, mul );
        v3 = _mm256_mul_ps( v3, mul );

        // Round to nearest integer
        // (_MM_FROUND_TO_NEAREST_INT is the proper immediate here;
        // _MM_ROUND_NEAREST is the MXCSR macro and only works by coincidence)
        v0 = _mm256_round_ps( v0, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC );
        v1 = _mm256_round_ps( v1, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC );
        v2 = _mm256_round_ps( v2, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC );
        v3 = _mm256_round_ps( v3, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC );

        // Convert floats to integers
        __m256i i0 = _mm256_cvtps_epi32( v0 );
        __m256i i1 = _mm256_cvtps_epi32( v1 );
        __m256i i2 = _mm256_cvtps_epi32( v2 );
        __m256i i3 = _mm256_cvtps_epi32( v3 );

        // Convert int32 to int16
        i0 = _mm256_packs_epi32( i0, i1 );  // 0, 1, 2, 3, 8, 9, 10, 11, 4, 5, 6, 7, 12, 13, 14, 15
        i2 = _mm256_packs_epi32( i2, i3 );  // 16, 17, 18, 19, 24, 25, 26, 27, 20, 21, 22, 23, 28, 29, 30, 31
        // Convert int16 to int8
        i0 = _mm256_packs_epi16( i0, i2 );  // 0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27, 4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31

        // We got our precious signed bytes, but the order is now wrong
        // These AVX2 pack instructions process 16-byte pieces independently
        // The following instruction fixes the order
        const __m256i perm = _mm256_setr_epi32( 0, 4, 1, 5, 2, 6, 3, 7 );
        i0 = _mm256_permutevar8x32_epi32( i0, perm );
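
        // To illustrate: viewing i0 as eight dwords of 4 values each, the packs
        // left them ordered { 0-3, 8-11, 16-19, 24-27, 4-7, 12-15, 20-23, 28-31 };
        // gathering dwords 0,4,1,5,2,6,3,7 restores 0 .. 31.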

        // Apply offset to translate the range from [ -7 .. +7 ] into [ +1 .. +15 ]
        const __m256i off = _mm256_set1_epi8( 8 );
        i0 = _mm256_add_epi8( i0, off );

        // Compress the vector into 4 bit/value, and store
        __m128i res = packNibbles( i0 );
        _mm_storeu_si128( ( __m128i* )pb, res );
        pb += bs;
    }
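
    // A worked example of the arithmetic above (numbers chosen for
    // illustration): if max(abs(e)) = 3.5 then d = 0.5 and id = 2.0; an
    // element x = 1.2 quantizes to round(1.2 * 2.0) = 2, is stored as
    // 2 + 8 = 10, and dequantizes to (10 - 8) * 0.5 = 1.0.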
#else
#error "not implemented for QK"
#endif
#elif defined(__wasm_simd128__)
#if QK == 32
    for (int i = 0; i < nb; i++) {

@@ -1285,6 +1395,61 @@ inline static void ggml_vec_dot_q4_0(const int n, float * restrict s, const void
#else
#error "not implemented for QK"
#endif
#elif defined(__AVX2__)
#if QK == 32
    const size_t countBlocks = nb;

    // Initialize accumulator with zeros
    __m256 acc = _mm256_setzero_ps();

    // Main loop
    for (int i = 0; i < nb; ++i) {
        const float * d0_0 = (const float *) (pd0 + i*bs);
        const float * d1_0 = (const float *) (pd1 + i*bs);

        const uint8_t * restrict p0 = pb0 + i*bs;
        const uint8_t * restrict p1 = pb1 + i*bs;

        // Compute combined scale for the block
        const __m256 scale = _mm256_mul_ps( _mm256_broadcast_ss( d0_0 ), _mm256_broadcast_ss( d1_0 ) );

        // Load 16 bytes, and unpack 4 bit fields into bytes, making 32 bytes
        __m256i bx = bytesFromNibbles( p0 );
        __m256i by = bytesFromNibbles( p1 );

        // Now we have a vector with bytes in [ 0 .. 15 ] interval. Offset them into [ -8 .. +7 ] interval.
        const __m256i off = _mm256_set1_epi8( 8 );
        bx = _mm256_sub_epi8( bx, off );
        by = _mm256_sub_epi8( by, off );

        // Sign-extend first 16 signed bytes into int16_t
        __m256i x16 = _mm256_cvtepi8_epi16( _mm256_castsi256_si128( bx ) );
        __m256i y16 = _mm256_cvtepi8_epi16( _mm256_castsi256_si128( by ) );
        // Compute products of int16_t integers, add pairwise
        __m256i i32 = _mm256_madd_epi16( x16, y16 );

        // Sign-extend last 16 signed bytes into int16_t vectors
        x16 = _mm256_cvtepi8_epi16( _mm256_extracti128_si256( bx, 1 ) );
        y16 = _mm256_cvtepi8_epi16( _mm256_extracti128_si256( by, 1 ) );
        // Accumulate products of int16_t integers
        i32 = _mm256_add_epi32( i32, _mm256_madd_epi16( x16, y16 ) );

        // Convert int32_t to float
        __m256 p = _mm256_cvtepi32_ps( i32 );
        // Apply the scale, and accumulate
        acc = _mm256_fmadd_ps( scale, p, acc );
    }
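
    // Per block, the loop above amounts to (scalar sketch, with x[j], y[j]
    // the unpacked values in [ -8 .. +7 ] and d0, d1 the two deltas):
    //
    //     int sum = 0;
    //     for ( int j = 0; j < QK; ++j )
    //         sum += x[j] * y[j];
    //     sumf += d0 * d1 * (float)sum;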

    // Return horizontal sum of the acc vector
    __m128 res = _mm256_extractf128_ps( acc, 1 );
    res = _mm_add_ps( res, _mm256_castps256_ps128( acc ) );
    res = _mm_add_ps( res, _mm_movehl_ps( res, res ) );
    res = _mm_add_ss( res, _mm_movehdup_ps( res ) );

    sumf = _mm_cvtss_f32( res );
#else
#error "not implemented for QK"
#endif
#elif defined(__wasm_simd128__)
#if QK == 32
    // wasm simd