Compare commits

..

1 Commits

Author SHA1 Message Date
Georgi Gerganov 0ac8651bd6
10% performance boost on ARM
1 year ago

@ -218,18 +218,3 @@ Note the use of `--color` to distinguish between user input and generated text.
know how to utilize it properly. But in any case, you can even disable it with `LLAMA_NO_ACCELERATE=1 make` and the
performance will be the same, since no BLAS calls are invoked by the current implementation.
### Contributing
- There are 2 git branches: [master](https://github.com/ggerganov/llama.cpp/commits/master) and [dev](https://github.com/ggerganov/llama.cpp/commits/dev)
- Contributors can open PRs to either one
- Collaborators can push directly to `dev`, but need to open a PR to get changes into `master`
- Collaborators will be invited based on contributions
- The `dev` branch is considered unstable
- The `master` branch is considered stable and approved. Third-party projects should use the `master` branch
General principles to follow when writing code:
- Avoid adding third-party dependencies, extra files, extra headers, etc.
- Always consider cross-compatibility with other operating systems and architectures
- Avoid fancy-looking modern STL constructs; use basic `for` loops, avoid templates, and keep it simple
- There are no strict rules for the code style, but try to follow the patterns in the code (indentation, spaces, etc.). Vertical alignment makes things more readable and easier to batch edit

@ -1360,6 +1360,22 @@ inline static void ggml_vec_dot_q4_0(const int n, float * restrict s, const void
const int8x16_t v1_1hs = vsubq_s8(v1_1h, s8b);
// dot product into int16x8_t
#if defined(__ARM_FEATURE_DOTPROD)
int32x4_t p_0 = vdotq_s32(vdupq_n_s32(0), v0_0ls, v1_0ls);
int32x4_t p_1 = vdotq_s32(vdupq_n_s32(0), v0_1ls, v1_1ls);
p_0 = vdotq_s32(p_0, v0_0hs, v1_0hs);
p_1 = vdotq_s32(p_1, v0_1hs, v1_1hs);
// scalar
#if defined(__ARM_FEATURE_QRDMX)
sum0 += d0_0*d1_0*vaddvq_s32(p_0);
sum1 += d0_1*d1_1*vaddvq_s32(p_1);
#else
sum0 += d0_0*d1_0*(vgetq_lane_s32(p_0, 0) + vgetq_lane_s32(p_0, 1) + vgetq_lane_s32(p_0, 2) + vgetq_lane_s32(p_0, 3));
sum1 += d0_1*d1_1*(vgetq_lane_s32(p_1, 0) + vgetq_lane_s32(p_1, 1) + vgetq_lane_s32(p_1, 2) + vgetq_lane_s32(p_1, 3));
#endif
#else
const int16x8_t pl0l = vmull_s8(vget_low_s8 (v0_0ls), vget_low_s8 (v1_0ls));
const int16x8_t pl0h = vmull_s8(vget_high_s8(v0_0ls), vget_high_s8(v1_0ls));
@ -1388,6 +1404,7 @@ inline static void ggml_vec_dot_q4_0(const int n, float * restrict s, const void
#else
sum0 += d0_0*d1_0*(vgetq_lane_s16(p_0, 0) + vgetq_lane_s16(p_0, 1) + vgetq_lane_s16(p_0, 2) + vgetq_lane_s16(p_0, 3) + vgetq_lane_s16(p_0, 4) + vgetq_lane_s16(p_0, 5) + vgetq_lane_s16(p_0, 6) + vgetq_lane_s16(p_0, 7));
sum1 += d0_1*d1_1*(vgetq_lane_s16(p_1, 0) + vgetq_lane_s16(p_1, 1) + vgetq_lane_s16(p_1, 2) + vgetq_lane_s16(p_1, 3) + vgetq_lane_s16(p_1, 4) + vgetq_lane_s16(p_1, 5) + vgetq_lane_s16(p_1, 6) + vgetq_lane_s16(p_1, 7));
#endif
#endif
}

@ -11,10 +11,8 @@
#include <string>
#include <vector>
#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__))
#include <signal.h>
#include <unistd.h>
#endif
#define ANSI_COLOR_RED "\x1b[31m"
#define ANSI_COLOR_GREEN "\x1b[32m"
@ -749,7 +747,6 @@ bool llama_eval(
static bool is_interacting = false;
#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__))
void sigint_handler(int signo) {
if (signo == SIGINT) {
if (!is_interacting) {
@ -759,7 +756,6 @@ void sigint_handler(int signo) {
}
}
}
#endif
int main(int argc, char ** argv) {
ggml_time_init();
@ -826,13 +822,11 @@ int main(int argc, char ** argv) {
}
printf("\n");
if (params.interactive) {
#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__))
struct sigaction sigint_action;
sigint_action.sa_handler = sigint_handler;
sigemptyset (&sigint_action.sa_mask);
sigint_action.sa_flags = 0;
sigaction(SIGINT, &sigint_action, NULL);
#endif
printf("%s: interactive mode on.\n", __func__);
@ -861,9 +855,7 @@ int main(int argc, char ** argv) {
if (params.interactive) {
printf("== Running in interactive mode. ==\n"
#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__))
" - Press Ctrl+C to interject at any time.\n"
#endif
" - Press Return to return control to LLaMa.\n"
" - If you want to submit another line, end your input in '\\'.\n");
}
@ -965,15 +957,10 @@ int main(int argc, char ** argv) {
// currently being interactive
bool another_line=true;
while (another_line) {
fflush(stdout);
char buf[256] = {0};
int n_read;
if(params.use_color) printf(ANSI_BOLD ANSI_COLOR_GREEN);
if (scanf("%255[^\n]%n%*c", buf, &n_read) <= 0) {
// presumably an empty line; consume the newline
scanf("%*c");
n_read=0;
}
scanf("%255[^\n]%n%*c", buf, &n_read);
if(params.use_color) printf(ANSI_COLOR_RESET);
if (n_read > 0 && buf[n_read-1]=='\\') {
@ -989,8 +976,6 @@ int main(int argc, char ** argv) {
std::vector<gpt_vocab::id> line_inp = ::llama_tokenize(vocab, buf, false);
embd_inp.insert(embd_inp.end(), line_inp.begin(), line_inp.end());
remaining_tokens -= line_inp.size();
input_noecho = true; // do not echo this again
}

Loading…
Cancel
Save