Compare commits
23 Commits
Author | SHA1 | Date |
---|---|---|
![]() |
4c2f924553 | 2 years ago |
![]() |
ba3e8a3d7f | 2 years ago |
![]() |
2546cb7780 | 2 years ago |
![]() |
8f8a5aca99 | 2 years ago |
![]() |
efa2cc36a2 | 2 years ago |
![]() |
3b3ad42906 | 2 years ago |
![]() |
a6acb3318a | 2 years ago |
![]() |
47b297224e | 3 years ago |
![]() |
0467385010 | 3 years ago |
![]() |
fb64edddb7 | 3 years ago |
![]() |
c40a5b51a0 | 3 years ago |
![]() |
a0f2f68cdb | 3 years ago |
![]() |
dee3684fec | 3 years ago |
![]() |
6ed4da0b03 | 3 years ago |
![]() |
06e2a3b721 | 3 years ago |
![]() |
78af1420bf | 3 years ago |
![]() |
1af4cf0102 | 3 years ago |
![]() |
73a7916d30 | 3 years ago |
![]() |
e0abac1be7 | 3 years ago |
![]() |
45fc4fed0b | 3 years ago |
![]() |
deb0c486c7 | 3 years ago |
![]() |
d677c7f61d | 3 years ago |
![]() |
446ccf3ab1 | 3 years ago |
@ -0,0 +1,162 @@
|
||||
#include "common.h"
|
||||
|
||||
// third-party utilities
|
||||
// use your favorite implementations
|
||||
#define DR_WAV_IMPLEMENTATION
|
||||
#include "dr_wav.h"
|
||||
|
||||
#include <cmath>
|
||||
#include <regex>
|
||||
|
||||
#ifndef M_PI
|
||||
#define M_PI 3.14159265358979323846
|
||||
#endif
|
||||
|
||||
// Return a copy of s with leading and trailing whitespace removed.
//
// Whitespace is space, \t, \n, \v, \f and \r (what the regex class \s matches
// in the default "C" locale). Whitespace inside the string is left untouched.
// An empty or whitespace-only input yields an empty string.
//
// Implemented with plain string searches: building a std::regex on every call
// is far more expensive than the trimming itself.
std::string trim(const std::string & s) {
    static const char * const k_whitespace = " \t\n\v\f\r";

    const size_t first = s.find_first_not_of(k_whitespace);
    if (first == std::string::npos) {
        // nothing but whitespace (or nothing at all)
        return std::string();
    }

    // a non-whitespace character exists, so this search cannot fail
    const size_t last = s.find_last_not_of(k_whitespace);

    return s.substr(first, last - first + 1);
}
|
||||
|
||||
// Return a copy of s in which every non-overlapping occurrence of `from`
// is replaced by `to`, scanning left to right.
//
// Text inserted by a replacement is never rescanned, so replace("aa", "a", "aa")
// yields "aaaa" rather than looping.
// An empty `from` matches nothing and s is returned unchanged (searching for an
// empty string would otherwise match at every position and never terminate).
std::string replace(const std::string & s, const std::string & from, const std::string & to) {
    std::string result = s;

    if (from.empty()) {
        return result;
    }

    size_t pos = 0;
    while ((pos = result.find(from, pos)) != std::string::npos) {
        result.replace(pos, from.length(), to);
        // skip past the inserted text so it is not matched again
        pos += to.length();
    }

    return result;
}
|
||||
|
||||
bool read_wav(const std::string & fname, std::vector<float>& pcmf32, std::vector<std::vector<float>>& pcmf32s, bool stereo) {
|
||||
drwav wav;
|
||||
std::vector<uint8_t> wav_data; // used for pipe input from stdin
|
||||
|
||||
if (fname == "-") {
|
||||
{
|
||||
uint8_t buf[1024];
|
||||
while (true)
|
||||
{
|
||||
const size_t n = fread(buf, 1, sizeof(buf), stdin);
|
||||
if (n == 0) {
|
||||
break;
|
||||
}
|
||||
wav_data.insert(wav_data.end(), buf, buf + n);
|
||||
}
|
||||
}
|
||||
|
||||
if (drwav_init_memory(&wav, wav_data.data(), wav_data.size(), nullptr) == false) {
|
||||
fprintf(stderr, "error: failed to open WAV file from stdin\n");
|
||||
return false;
|
||||
}
|
||||
|
||||
fprintf(stderr, "%s: read %zu bytes from stdin\n", __func__, wav_data.size());
|
||||
}
|
||||
else if (drwav_init_file(&wav, fname.c_str(), nullptr) == false) {
|
||||
fprintf(stderr, "error: failed to open '%s' as WAV file\n", fname.c_str());
|
||||
return false;
|
||||
}
|
||||
|
||||
if (wav.channels != 1 && wav.channels != 2) {
|
||||
fprintf(stderr, "%s: WAV file '%s' must be mono or stereo\n", __func__, fname.c_str());
|
||||
return false;
|
||||
}
|
||||
|
||||
if (stereo && wav.channels != 2) {
|
||||
fprintf(stderr, "%s: WAV file '%s' must be stereo for diarization\n", __func__, fname.c_str());
|
||||
return false;
|
||||
}
|
||||
|
||||
if (wav.sampleRate != COMMON_SAMPLE_RATE) {
|
||||
fprintf(stderr, "%s: WAV file '%s' must be %i kHz\n", __func__, fname.c_str(), COMMON_SAMPLE_RATE/1000);
|
||||
return false;
|
||||
}
|
||||
|
||||
if (wav.bitsPerSample != 16) {
|
||||
fprintf(stderr, "%s: WAV file '%s' must be 16-bit\n", __func__, fname.c_str());
|
||||
return false;
|
||||
}
|
||||
|
||||
const uint64_t n = wav_data.empty() ? wav.totalPCMFrameCount : wav_data.size()/(wav.channels*wav.bitsPerSample/8);
|
||||
|
||||
std::vector<int16_t> pcm16;
|
||||
pcm16.resize(n*wav.channels);
|
||||
drwav_read_pcm_frames_s16(&wav, n, pcm16.data());
|
||||
drwav_uninit(&wav);
|
||||
|
||||
// convert to mono, float
|
||||
pcmf32.resize(n);
|
||||
if (wav.channels == 1) {
|
||||
for (uint64_t i = 0; i < n; i++) {
|
||||
pcmf32[i] = float(pcm16[i])/32768.0f;
|
||||
}
|
||||
} else {
|
||||
for (uint64_t i = 0; i < n; i++) {
|
||||
pcmf32[i] = float(pcm16[2*i] + pcm16[2*i + 1])/65536.0f;
|
||||
}
|
||||
}
|
||||
|
||||
if (stereo) {
|
||||
// convert to stereo, float
|
||||
pcmf32s.resize(2);
|
||||
|
||||
pcmf32s[0].resize(n);
|
||||
pcmf32s[1].resize(n);
|
||||
for (uint64_t i = 0; i < n; i++) {
|
||||
pcmf32s[0][i] = float(pcm16[2*i])/32768.0f;
|
||||
pcmf32s[1][i] = float(pcm16[2*i + 1])/32768.0f;
|
||||
}
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
// Apply a first-order (RC) high-pass filter to `data` in place.
//
// data        - samples to filter; data[0] is kept as the initial output
// cutoff      - cutoff frequency in Hz
// sample_rate - sampling rate of `data` in Hz
//
// Uses the discrete-time recurrence
//     y[i] = alpha * (y[i-1] + x[i] - x[i-1]),   alpha = RC / (RC + dt)
// with RC = 1/(2*pi*cutoff) and dt = 1/sample_rate.
//
// Nothing is done for an empty buffer or a non-positive cutoff / sample rate.
void high_pass_filter(std::vector<float> & data, float cutoff, float sample_rate) {
    if (data.empty() || cutoff <= 0.0f || sample_rate <= 0.0f) {
        return;
    }

    const float rc    = 1.0f / (2.0f * M_PI * cutoff);
    const float dt    = 1.0f / sample_rate;
    const float alpha = rc / (rc + dt);

    // The recurrence needs the previous *unfiltered* sample. Because the
    // buffer is overwritten as we go, that sample is kept in x_prev; reading
    // data[i - 1] instead would pick up the already-filtered value and reduce
    // the whole filter to a constant gain.
    float x_prev = data[0];
    float y      = data[0];

    for (size_t i = 1; i < data.size(); i++) {
        const float x = data[i];

        y = alpha * (y + x - x_prev);

        x_prev  = x;
        data[i] = y;
    }
}
|
||||
|
||||
bool vad_simple(std::vector<float> & pcmf32, int sample_rate, int last_ms, float vad_thold, float freq_thold, bool verbose) {
|
||||
const int n_samples = pcmf32.size();
|
||||
const int n_samples_last = (sample_rate * last_ms) / 1000;
|
||||
|
||||
if (n_samples_last >= n_samples) {
|
||||
// not enough samples - assume no speech
|
||||
return false;
|
||||
}
|
||||
|
||||
if (freq_thold > 0.0f) {
|
||||
high_pass_filter(pcmf32, freq_thold, sample_rate);
|
||||
}
|
||||
|
||||
float energy_all = 0.0f;
|
||||
float energy_last = 0.0f;
|
||||
|
||||
for (int i = 0; i < n_samples; i++) {
|
||||
energy_all += fabsf(pcmf32[i]);
|
||||
|
||||
if (i >= n_samples - n_samples_last) {
|
||||
energy_last += fabsf(pcmf32[i]);
|
||||
}
|
||||
}
|
||||
|
||||
energy_all /= n_samples;
|
||||
energy_last /= n_samples_last;
|
||||
|
||||
if (verbose) {
|
||||
fprintf(stderr, "%s: energy_all: %f, energy_last: %f, vad_thold: %f, freq_thold: %f\n", __func__, energy_all, energy_last, vad_thold, freq_thold);
|
||||
}
|
||||
|
||||
if (energy_last > vad_thold*energy_all) {
|
||||
return false;
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
@ -0,0 +1,40 @@
|
||||
#pragma once
|
||||
|
||||
// needs to match WHISPER_SAMPLE_RATE
|
||||
#define COMMON_SAMPLE_RATE 16000
|
||||
|
||||
#include <vector>
|
||||
#include <string>
|
||||
|
||||
std::string trim(const std::string & s);
|
||||
|
||||
std::string replace(
|
||||
const std::string & s,
|
||||
const std::string & from,
|
||||
const std::string & to);
|
||||
|
||||
// Read WAV audio file and store the PCM data into pcmf32
|
||||
// The sample rate of the audio must be equal to COMMON_SAMPLE_RATE
|
||||
// If the stereo flag is set the audio must have 2 channels (otherwise the call fails);
// pcmf32s then holds the 2 channels separately and pcmf32 holds their mono average
|
||||
bool read_wav(
|
||||
const std::string & fname,
|
||||
std::vector<float> & pcmf32,
|
||||
std::vector<std::vector<float>> & pcmf32s,
|
||||
bool stereo);
|
||||
|
||||
// Apply a high-pass frequency filter to PCM audio
|
||||
// Suppresses frequencies below cutoff Hz
|
||||
void high_pass_filter(
|
||||
std::vector<float> & data,
|
||||
float cutoff,
|
||||
float sample_rate);
|
||||
|
||||
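// Basic voice activity detection (VAD) using an adaptive audio-energy threshold
// Returns true when the mean energy of the last last_ms milliseconds is not above
// vad_thold times the mean energy of the whole buffer. Returns false otherwise, or
// when the buffer is not longer than last_ms.
// pcmf32 is high-pass filtered in place when freq_thold > 0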
|
||||
bool vad_simple(
|
||||
std::vector<float> & pcmf32,
|
||||
int sample_rate,
|
||||
int last_ms,
|
||||
float vad_thold,
|
||||
float freq_thold,
|
||||
bool verbose);
|
||||
|
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
@ -0,0 +1,475 @@
|
||||
// quantized matrix multiplication
|
||||
|
||||
#include "ggml.h"
|
||||
|
||||
#include <float.h>
|
||||
#include <stdint.h>
|
||||
#include <stdio.h>
|
||||
#include <assert.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include <time.h>
|
||||
#include <math.h>
|
||||
|
||||
#include <sys/time.h>
|
||||
|
||||
#ifdef __ARM_NEON
|
||||
#include "arm_neon.h"
|
||||
#endif
|
||||
|
||||
#ifndef MIN
|
||||
#define MAX(a, b) ((a) > (b) ? (a) : (b))
|
||||
#define MIN(a, b) ((a) < (b) ? (a) : (b))
|
||||
#endif
|
||||
|
||||
const int M = 1280;
|
||||
const int N = 1536;
|
||||
const int K = 1280;
|
||||
|
||||
const int QK = 64;
|
||||
#define QB 7
|
||||
|
||||
//#define GGML_GQ_USE_FP16_SCALE
|
||||
|
||||
#if defined(GGML_GQ_USE_FP16_SCALE)
|
||||
#define gq_scale_t ggml_fp16_t
|
||||
#define GGML_FP32_TO_GQ(x) ggml_fp32_to_fp16(x)
|
||||
#define GGML_GQ_TO_FP32(x) ggml_fp16_to_fp32(x)
|
||||
#else
|
||||
#define gq_scale_t float
|
||||
#define GGML_FP32_TO_GQ(x) (x)
|
||||
#define GGML_GQ_TO_FP32(x) (x)
|
||||
#endif
|
||||
|
||||
#define gq_quant_t uint64_t
|
||||
#define gq_t_bits 64
|
||||
|
||||
// Current wall-clock time in microseconds, as reported by gettimeofday().
//
// The seconds field is widened to 64 bits before scaling so the multiplication
// cannot overflow on platforms where time_t is a 32-bit type.
uint64_t get_time_us(void) {
    struct timeval tv;
    gettimeofday(&tv, NULL);
    return (uint64_t) tv.tv_sec * 1000000ULL + (uint64_t) tv.tv_usec;
}
|
||||
|
||||
//
|
||||
// naive implementation
|
||||
//
|
||||
|
||||
// Reference matrix product in plain float arithmetic.
//
// src0 holds m rows of k values, src1 holds n rows of k values (i.e. the second
// operand is stored transposed). dst receives m rows of n values, where
// dst[i*n + j] is the dot product of row i of src0 and row j of src1.
void mul_mat_f32_naive(
    const float * restrict src0, // M x K
    const float * restrict src1, // N x K (transposed)
    float * dst,
    int m, int n, int k) {
    for (int row = 0; row < m; ++row) {
        const float * lhs = src0 + row*k;
        float       * out = dst  + row*n;

        for (int col = 0; col < n; ++col) {
            const float * rhs = src1 + col*k;

            float acc = 0;
            for (int t = 0; t < k; ++t) {
                acc += lhs[t] * rhs[t];
            }

            out[col] = acc;
        }
    }
}
|
||||
|
||||
//
|
||||
// method 1
|
||||
//
|
||||
|
||||
void quantize_1(const float * src, void * dst, int n, int k) {
|
||||
char * p0 = dst;
|
||||
|
||||
gq_quant_t pp[QB];
|
||||
|
||||
for (int j = 0; j < n; j++) {
|
||||
for (int i = 0; i < k/QK; i++) {
|
||||
float min = FLT_MAX;
|
||||
float max = -FLT_MAX;
|
||||
|
||||
// find min/max
|
||||
#ifdef __ARM_NEON
|
||||
{
|
||||
float32x4_t minv = vdupq_n_f32(FLT_MAX);
|
||||
float32x4_t maxv = vdupq_n_f32(-FLT_MAX);
|
||||
|
||||
for (int l = 0; l < QK; l += 4) {
|
||||
float32x4_t v = vld1q_f32(src + j*k + i*QK + l);
|
||||
minv = vminq_f32(minv, v);
|
||||
maxv = vmaxq_f32(maxv, v);
|
||||
}
|
||||
|
||||
float32x2_t minv32 = vpmin_f32(vget_low_f32(minv), vget_high_f32(minv));
|
||||
float32x2_t maxv32 = vpmax_f32(vget_low_f32(maxv), vget_high_f32(maxv));
|
||||
|
||||
min = MIN(vget_lane_f32(minv32, 0), vget_lane_f32(minv32, 1));
|
||||
max = MAX(vget_lane_f32(maxv32, 0), vget_lane_f32(maxv32, 1));
|
||||
|
||||
//printf("SIMD min/max: %f %f\n", min, max);
|
||||
}
|
||||
#else
|
||||
{
|
||||
for (int l = 0; l < QK; l++) {
|
||||
const float v = src[j*k + i*QK + l];
|
||||
if (v < min) min = v;
|
||||
if (v > max) max = v;
|
||||
}
|
||||
|
||||
//printf("NORM min/max: %f %f\n", min, max);
|
||||
}
|
||||
#endif
|
||||
|
||||
const float d = (max - min) / ((1 << QB) - 1);
|
||||
const float id = d ? 1.0/d : 0.0;
|
||||
|
||||
memcpy(p0, &min, sizeof(float)); p0 += sizeof(float);
|
||||
memcpy(p0, &d, sizeof(float)); p0 += sizeof(float);
|
||||
|
||||
//printf("min/max/d/id: %f %f %f %f\n", min, max, d, id);
|
||||
|
||||
for (int s = 0; s < QK/gq_t_bits; ++s) {
|
||||
memset(pp, 0, sizeof(pp));
|
||||
|
||||
for (int l = 0; l < gq_t_bits; l++) {
|
||||
const float v = src[j*k + i*QK + s*gq_t_bits + l];
|
||||
const uint8_t q = (v - min)*id;
|
||||
|
||||
for (int b = 0; b < QB; b++) {
|
||||
pp[b] |= q & (1 << b) ? (1ULL << l) : 0;
|
||||
}
|
||||
}
|
||||
|
||||
for (int b = 0; b < QB; b++) {
|
||||
memcpy(p0, &pp[b], sizeof(gq_quant_t)); p0 += sizeof(gq_quant_t);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void mul_mat_gq_1(
|
||||
const void * src0,
|
||||
const void * src1,
|
||||
float * dst,
|
||||
int m, int n, int k) {
|
||||
const int kp = k & ~(gq_t_bits - 1);
|
||||
|
||||
const char * restrict p0 = src0;
|
||||
const char * restrict p1 = src1;
|
||||
|
||||
float s0[QB + 1];
|
||||
float s1[QB + 1];
|
||||
|
||||
gq_quant_t m0[QB + 1];
|
||||
gq_quant_t m1[QB + 1];
|
||||
|
||||
for (int ir0 = 0; ir0 < m; ir0++) {
|
||||
for (int ir1 = 0; ir1 < n; ir1++) {
|
||||
float sumf = 0.0;
|
||||
|
||||
const char * restrict pp0 = p0 + ir0*((2*sizeof(float) + (QK/gq_t_bits)*QB*sizeof(gq_quant_t))*(k/QK));
|
||||
const char * restrict pp1 = p1 + ir1*((2*sizeof(float) + (QK/gq_t_bits)*QB*sizeof(gq_quant_t))*(k/QK));
|
||||
|
||||
for (int i = 0; i < kp/QK; i++) {
|
||||
float min0, d0;
|
||||
memcpy(&min0, pp0, sizeof(float)); pp0 += sizeof(float);
|
||||
memcpy(&d0, pp0, sizeof(float)); pp0 += sizeof(float);
|
||||
|
||||
float min1, d1;
|
||||
memcpy(&min1, pp1, sizeof(float)); pp1 += sizeof(float);
|
||||
memcpy(&d1, pp1, sizeof(float)); pp1 += sizeof(float);
|
||||
|
||||
//printf("min0/d0 = %f %f | min1/d1 = %f %f\n", min0, d0, min1, d1);
|
||||
|
||||
#if 1
|
||||
// >>> General case for any QB
|
||||
|
||||
s0[0] = min0;
|
||||
s1[0] = min1;
|
||||
|
||||
for (int b = 0; b < QB; b++) {
|
||||
s0[b + 1] = d0*(1 << b);
|
||||
s1[b + 1] = d1*(1 << b);
|
||||
}
|
||||
|
||||
m0[0] = -1ULL;
|
||||
m1[0] = -1ULL;
|
||||
|
||||
for (int s = 0; s < QK/gq_t_bits; ++s) {
|
||||
for (int b = 0; b < QB; b++) {
|
||||
memcpy(&m0[b + 1], pp0, sizeof(gq_quant_t)); pp0 += sizeof(gq_quant_t);
|
||||
memcpy(&m1[b + 1], pp1, sizeof(gq_quant_t)); pp1 += sizeof(gq_quant_t);
|
||||
}
|
||||
|
||||
for (int q0 = 0; q0 < QB + 1; q0++) {
|
||||
for (int q1 = 0; q1 < QB + 1; q1++) {
|
||||
sumf += s0[q0]*s1[q1]*__builtin_popcountll(m0[q0] & m1[q1]);
|
||||
}
|
||||
}
|
||||
}
|
||||
#else
|
||||
#endif
|
||||
}
|
||||
|
||||
dst[ir0*n + ir1] = sumf;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
//
|
||||
// method 2
|
||||
//
|
||||
|
||||
static inline int quantize_2_blocks_per_row(int k) {
|
||||
return k/QK;
|
||||
}
|
||||
|
||||
static inline int quantize_2_quants_per_block() {
|
||||
return QK/gq_t_bits;
|
||||
}
|
||||
|
||||
static inline int quantize_2_row_size(int k) {
|
||||
const int nb = quantize_2_blocks_per_row(k);
|
||||
const int nq = quantize_2_quants_per_block();
|
||||
|
||||
return nb*(2*sizeof(gq_scale_t) + nq*QB*sizeof(gq_quant_t));
|
||||
}
|
||||
|
||||
void quantize_2_row(const float * restrict src, void * restrict dst, int k) {
|
||||
assert(k % QK == 0);
|
||||
|
||||
const int nb = quantize_2_blocks_per_row(k);
|
||||
const int nq = quantize_2_quants_per_block();
|
||||
|
||||
gq_scale_t * restrict pm = (gq_scale_t *) (dst);
|
||||
gq_scale_t * restrict pd = (gq_scale_t *) (pm + nb);
|
||||
gq_quant_t * restrict pb = (gq_quant_t *) (pd + nb);
|
||||
|
||||
gq_quant_t pp[QB];
|
||||
|
||||
for (int i = 0; i < nb; i++) {
|
||||
float min = FLT_MAX;
|
||||
float max = -FLT_MAX;
|
||||
|
||||
for (int l = 0; l < QK; l++) {
|
||||
const float v = src[i*QK + l];
|
||||
if (v < min) min = v;
|
||||
if (v > max) max = v;
|
||||
}
|
||||
|
||||
const float d = (max - min) / ((1 << QB) - 1);
|
||||
const float id = d ? 1.0/d : 0.0;
|
||||
|
||||
pm[i] = GGML_FP32_TO_GQ(min);
|
||||
pd[i] = GGML_FP32_TO_GQ(d);
|
||||
|
||||
for (int s = 0; s < nq; ++s) {
|
||||
memset(pp, 0, sizeof(pp));
|
||||
|
||||
for (int l = 0; l < gq_t_bits; l++) {
|
||||
const float v = src[i*QK + s*gq_t_bits + l];
|
||||
const uint8_t q = (v - min)*id;
|
||||
|
||||
for (int b = 0; b < QB; b++) {
|
||||
pp[b] |= q & (1 << b) ? (1ULL << l) : 0;
|
||||
}
|
||||
}
|
||||
|
||||
for (int b = 0; b < QB; b++) {
|
||||
pb[i*nq*QB + s*QB + b] = pp[b];
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// reimplementation of quantize_2 using quantize_2_row
|
||||
void quantize_2(const float * restrict src, char * restrict dst, int n, int k) {
|
||||
assert(k % QK == 0);
|
||||
|
||||
for (int j = 0; j < n; j++) {
|
||||
quantize_2_row(src + j*k, dst, k);
|
||||
dst = (char *) dst + quantize_2_row_size(k);
|
||||
}
|
||||
}
|
||||
|
||||
void vec_dot_gq_2(const int n, float * restrict s, const void * restrict x, const void * restrict y) {
|
||||
float sumf[(QB + 1)*(QB + 1)];
|
||||
memset(sumf, 0, sizeof(sumf));
|
||||
|
||||
const int nb = quantize_2_blocks_per_row(n);
|
||||
const int nq = quantize_2_quants_per_block();
|
||||
|
||||
const gq_scale_t * restrict pm0 = (const gq_scale_t *) x;
|
||||
const gq_scale_t * restrict pm1 = (const gq_scale_t *) y;
|
||||
|
||||
const gq_scale_t * restrict pd0 = pm0 + nb;
|
||||
const gq_scale_t * restrict pd1 = pm1 + nb;
|
||||
|
||||
const gq_quant_t * restrict pb0 = (const gq_quant_t *) (pd0 + nb);
|
||||
const gq_quant_t * restrict pb1 = (const gq_quant_t *) (pd1 + nb);
|
||||
|
||||
#if 1
|
||||
float s0[QB + 1];
|
||||
float s1[QB + 1];
|
||||
|
||||
for (int i = 0; i < nb; i++) {
|
||||
const float m0 = GGML_GQ_TO_FP32(pm0[i]);
|
||||
const float d0 = GGML_GQ_TO_FP32(pd0[i]);
|
||||
|
||||
const float m1 = GGML_GQ_TO_FP32(pm1[i]);
|
||||
const float d1 = GGML_GQ_TO_FP32(pd1[i]);
|
||||
|
||||
s0[0] = m0;
|
||||
s1[0] = m1;
|
||||
|
||||
for (int b = 0; b < QB; b++) {
|
||||
s0[b + 1] = d0*(1 << b);
|
||||
s1[b + 1] = d1*(1 << b);
|
||||
}
|
||||
|
||||
for (int s = 0; s < nq; ++s) {
|
||||
for (int q0 = 0; q0 < QB + 1; q0++) {
|
||||
const gq_quant_t mm0 = q0 ? pb0[i*nq*QB + s*QB + q0 - 1] : -1ULL;
|
||||
for (int q1 = 0; q1 < QB + 1; q1++) {
|
||||
const gq_quant_t mm1 = q1 ? pb1[i*nq*QB + s*QB + q1 - 1] : -1ULL;
|
||||
sumf[q0*(QB + 1) + q1] += s0[q0]*s1[q1]*__builtin_popcountll(mm0 & mm1);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
#else
|
||||
// SIMD-ify with the assumptions:
|
||||
// - nb is a multiple of 4
|
||||
// - gq_scale_t is float
|
||||
// - gq_quant_t is uint64_t
|
||||
// - QB == 7
|
||||
assert(nb % 4 == 0);
|
||||
|
||||
#ifdef __ARM_NEON
|
||||
#else
|
||||
// TODO
|
||||
#endif
|
||||
|
||||
#endif
|
||||
|
||||
for (int q0 = 0; q0 < QB + 1; q0++) {
|
||||
for (int q1 = 1; q1 < QB + 1; q1++) {
|
||||
sumf[q0*(QB + 1)] += sumf[q0*(QB + 1) + q1];
|
||||
}
|
||||
}
|
||||
|
||||
*s = sumf[0];
|
||||
for (int q0 = 1; q0 < QB + 1; q0++) {
|
||||
*s += sumf[q0*(QB + 1)];
|
||||
}
|
||||
}
|
||||
|
||||
// use vec_dot_gq_2 to compute the dot product of two rows
|
||||
void mul_mat_gq_2(
|
||||
const void * src0,
|
||||
const void * src1, // transposed
|
||||
float * dst,
|
||||
int m, int n, int k) {
|
||||
assert(k % QK == 0);
|
||||
|
||||
const int nb = quantize_2_blocks_per_row(k);
|
||||
const int nq = quantize_2_quants_per_block();
|
||||
|
||||
for (int ir0 = 0; ir0 < m; ir0++) {
|
||||
for (int ir1 = 0; ir1 < n; ir1++) {
|
||||
vec_dot_gq_2(k, dst + ir1, src0, src1);
|
||||
src1 = (const char *) src1 + quantize_2_row_size(k);
|
||||
}
|
||||
src0 = (const char *) src0 + quantize_2_row_size(k);
|
||||
src1 = (const char *) src1 - n*quantize_2_row_size(k);
|
||||
|
||||
dst = (float *) dst + n;
|
||||
}
|
||||
}
|
||||
|
||||
int main(int argc, const char ** argv) {
|
||||
assert(sizeof(gq_quant_t)*8 == gq_t_bits);
|
||||
|
||||
float * src0 = (float *)malloc(sizeof(float)*M*K);
|
||||
float * src1 = (float *)malloc(sizeof(float)*N*K);
|
||||
float * dst = (float *)malloc(sizeof(float)*M*N);
|
||||
|
||||
for (int i = 0; i < M*K; i++) {
|
||||
src0[i] = rand() / (float)RAND_MAX;
|
||||
}
|
||||
|
||||
for (int i = 0; i < N*K; i++) {
|
||||
src1[i] = rand() / (float)RAND_MAX;
|
||||
}
|
||||
|
||||
void * src0_gq = calloc(1, quantize_2_row_size(K)*M);
|
||||
void * src1_gq = calloc(1, quantize_2_row_size(K)*N);
|
||||
|
||||
const size_t sizef16 = sizeof(ggml_fp16_t)*M*K + sizeof(ggml_fp16_t)*N*K;
|
||||
const size_t sizegq = quantize_2_row_size(K)*M + quantize_2_row_size(K)*N;
|
||||
|
||||
printf("compression: %f\n", (float)sizegq/sizef16);
|
||||
|
||||
int method = 0;
|
||||
if (argc > 1) {
|
||||
method = atoi(argv[1]);
|
||||
}
|
||||
|
||||
// convert fp32 -> gq
|
||||
{
|
||||
const uint64_t t_start = get_time_us();
|
||||
|
||||
if (method == 1) {
|
||||
quantize_1(src0, src0_gq, M, K);
|
||||
quantize_1(src1, src1_gq, N, K);
|
||||
}
|
||||
|
||||
if (method == 2) {
|
||||
quantize_2(src0, src0_gq, M, K);
|
||||
quantize_2(src1, src1_gq, N, K);
|
||||
}
|
||||
|
||||
const uint64_t t_end = get_time_us();
|
||||
printf("convert time: %f ms / method = %d\n", (t_end - t_start) / 1000.0, method);
|
||||
}
|
||||
|
||||
const int nIter = 1;
|
||||
|
||||
const clock_t start = clock();
|
||||
const uint64_t start_us = get_time_us();
|
||||
|
||||
double iM = 1.0/M;
|
||||
double sum = 0.0f;
|
||||
for (int i = 0; i < nIter; i++) {
|
||||
if (method == 0) {
|
||||
mul_mat_f32_naive(src0, src1, dst, M, N, K);
|
||||
}
|
||||
|
||||
if (method == 1) {
|
||||
mul_mat_gq_1(src0_gq, src1_gq, dst, M, N, K);
|
||||
}
|
||||
|
||||
if (method == 2) {
|
||||
mul_mat_gq_2(src0_gq, src1_gq, dst, M, N, K);
|
||||
}
|
||||
}
|
||||
|
||||
for (int i = 0; i < N; i++) {
|
||||
sum += dst[i]*iM;
|
||||
}
|
||||
|
||||
{
|
||||
const clock_t end = clock();
|
||||
const uint64_t end_us = get_time_us();
|
||||
printf("%s: elapsed ticks: %ld\n", __func__, end - start);
|
||||
printf("%s: elapsed us: %d / %f ms\n", __func__, (int)(end_us - start_us), (end_us - start_us) / 1000.0 / nIter);
|
||||
}
|
||||
|
||||
printf("%f\n", sum);
|
||||
|
||||
free(src0);
|
||||
free(src1);
|
||||
free(dst);
|
||||
|
||||
free(src0_gq);
|
||||
free(src1_gq);
|
||||
|
||||
return 0;
|
||||
}
|
@ -0,0 +1,218 @@
|
||||
// SVD dimensionality reduction
|
||||
|
||||
#include <float.h>
|
||||
#include <stdint.h>
|
||||
#include <stdio.h>
|
||||
#include <assert.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include <time.h>
|
||||
#include <math.h>
|
||||
|
||||
#include <sys/time.h>
|
||||
|
||||
#ifdef GGML_USE_ACCELERATE
|
||||
#include <Accelerate/Accelerate.h>
|
||||
#endif
|
||||
|
||||
// Pseudo-random float in [0, 1], derived from rand().
float frand() {
    const float sample = (float) rand();
    return sample / (float) RAND_MAX;
}
|
||||
|
||||
//int sgesvd_(char *__jobu, char *__jobvt, __CLPK_integer *__m,
|
||||
// __CLPK_integer *__n, __CLPK_real *__a, __CLPK_integer *__lda,
|
||||
// __CLPK_real *__s, __CLPK_real *__u, __CLPK_integer *__ldu,
|
||||
// __CLPK_real *__vt, __CLPK_integer *__ldvt, __CLPK_real *__work,
|
||||
// __CLPK_integer *__lwork,
|
||||
// __CLPK_integer *__info)
|
||||
|
||||
// Print a matrix stored as `rows` consecutive runs of `cols` floats:
// a "<title>:" header, one "col <i> : ..." line per run, then a blank line.
static void print_matrix(const char * title, const float * data, int rows, int cols) {
    printf("%s:\n", title);
    for (int i = 0; i < rows; ++i) {
        printf("col %d : ", i);
        for (int j = 0; j < cols; ++j) {
            printf("%9.5f ", data[i * cols + j]);
        }
        printf("\n");
    }
    printf("\n");
}

// SVD demo: builds n = 5 vectors of m = 10 values, decomposes them with
// LAPACK's sgesvd_ and projects the original vectors onto the scaled and
// normalized left singular vectors, printing every intermediate matrix.
//
// The data is stored as n consecutive runs of m floats and handed to LAPACK
// as an m x n matrix with leading dimension m, so each run is one column.
//
// Returns 0 on success, 1 if an allocation fails or sgesvd_ reports an error.
// NOTE(review): sgesvd_ is only declared here when GGML_USE_ACCELERATE pulls
// in the Accelerate header - confirm how other builds are expected to link.
int main(int argc, const char ** argv) {
    int m = 10;
    int n = 5;

    int status = 1;

    float * A    = (float *) malloc(n * m * sizeof(float));
    float * A0   = (float *) malloc(n * m * sizeof(float)); // pristine copy, sgesvd_ overwrites A
    float * U    = (float *) malloc(n * m * sizeof(float));
    float * S    = (float *) malloc(n * sizeof(float));
    float * V    = (float *) malloc(n * n * sizeof(float));
    float * A1   = (float *) malloc(n * n * sizeof(float));
    float * work = NULL;

    int lda  = m;
    int ldu  = m;
    int ldvt = n;

    float work_size = 0.0f;
    int lwork = -1;
    int info  = 0;

    if (!A || !A0 || !U || !S || !V || !A1) {
        fprintf(stderr, "%s: failed to allocate buffers\n", __func__);
        goto cleanup;
    }

    for (int i = 0; i < n; ++i) {
        for (int j = 0; j < m; ++j) {
            A[i * m + j] = (float) (10.0f*(i + 1) + 1.0f * frand());
            //A[i * m + j] = (float) (10.0f*(i%2 + 1) + 0.1f * frand());
            //if (i == 2) {
            //    A[i * m + j] += 20*frand();
            //}
            if ((i == 1 || i == 3) && j > m/2) {
                A[i * m + j] = -A[i * m + j];
            }
        }
    }

    // average vector
    //float * M = (float *) malloc(m * sizeof(float));

    //{
    //    for (int j = 0; j < m; ++j) {
    //        M[j] = 0.0f;
    //    }
    //    for (int i = 0; i < n; ++i) {
    //        for (int j = 0; j < m; ++j) {
    //            M[j] += A[i * m + j];
    //        }
    //    }
    //    for (int j = 0; j < m; ++j) {
    //        M[j] /= (float) n;
    //    }
    //}

    //// subtract average vector
    //for (int i = 0; i < n; ++i) {
    //    for (int j = 0; j < m; ++j) {
    //        A[i * m + j] -= M[j];
    //    }
    //}

    memcpy(A0, A, n * m * sizeof(float));

    // print A
    print_matrix("A", A, n, m);

    // SVD
    // A = U * S * V^T

    // first call with lwork = -1 only asks for the required workspace size
    sgesvd_("S", "S", &m, &n, A, &lda, S, U, &ldu, V, &ldvt, &work_size, &lwork, &info);

    lwork = (int) work_size;

    printf("work_size = %f, info = %d, lwork = %d\n", work_size, info, lwork);

    if (info != 0 || lwork < 1) {
        fprintf(stderr, "%s: sgesvd_ workspace query failed (info = %d, lwork = %d)\n", __func__, info, lwork);
        goto cleanup;
    }

    work = (float *) malloc(lwork * sizeof(float));
    if (!work) {
        fprintf(stderr, "%s: failed to allocate %d floats of workspace\n", __func__, lwork);
        goto cleanup;
    }

    sgesvd_("S", "S", &m, &n, A, &lda, S, U, &ldu, V, &ldvt, work, &lwork, &info);

    if (info != 0) {
        // info < 0: invalid argument, info > 0: the iteration did not converge
        fprintf(stderr, "%s: sgesvd_ failed (info = %d)\n", __func__, info);
        goto cleanup;
    }

    // print U
    print_matrix("U", U, n, m);

    // normalize S
    {
        double sum = 0.0;
        for (int i = 0; i < n; ++i) {
            sum += S[i];
        }
        sum *= sqrt((double) m);
        for (int i = 0; i < n; ++i) {
            S[i] /= sum;
        }
    }

    // print S
    printf("S:\n");
    for (int i = 0; i < n; ++i) {
        printf("- %d = %9.5f\n", i, S[i]);
    }
    printf("\n");

    // print V
    print_matrix("V", V, n, n);

    // print A
    print_matrix("A", A, n, m);

    // compute singular vectors in U
    for (int i = 0; i < n; ++i) {
        for (int j = 0; j < m; ++j) {
            U[i * m + j] *= S[i];
        }
    }

    // normalize U
    for (int i = 0; i < n; ++i) {
        double sum = 0.0;
        for (int j = 0; j < m; ++j) {
            sum += U[i * m + j] * U[i * m + j];
        }
        sum = sqrt(sum);
        for (int j = 0; j < m; ++j) {
            U[i * m + j] /= sum*sqrt((double) m);
        }
    }

    // print U
    print_matrix("U", U, n, m);

    // project A0 onto U
    for (int i = 0; i < n; ++i) {
        for (int j = 0; j < n; ++j) {
            A1[i * n + j] = 0.0f;
            for (int k = 0; k < m; ++k) {
                A1[i * n + j] += A0[i * m + k] * U[j * m + k];
            }
        }
    }

    // print A1
    print_matrix("A1", A1, n, n);

    status = 0;

cleanup:
    // free(NULL) is a no-op, so this is safe on every path
    free(work);
    free(A1);
    free(V);
    free(S);
    free(U);
    free(A0);
    free(A);

    return status;
}
|
Loading…
Reference in new issue