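10% performance boost on ARM: use the vdotq_s32 dot-product intrinsic in ggml_vec_dot_q4_0 when __ARM_FEATURE_DOTPROD is defined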

3 changed files with 18 additions and 31 deletions
--- a/README.md
+++ b/README.md
@@ -218,18 +218,3 @@ Note the use of `--color` to distinguish between user input and generated text.
  know how to utilize it properly. But in any case, you can even disable it with `LLAMA_NO_ACCELERATE=1 make` and the
  performance will be the same, since no BLAS calls are invoked by the current implementation

-### Contributing
-
- There are 2 git branches: [master](https://github.com/ggerganov/llama.cpp/commits/master) and [dev](https://github.com/ggerganov/llama.cpp/commits/dev)
- Contributors can open PRs to either one
- Collaborators can push straight into `dev`, but need to open a PR to get stuff to `master`
- Collaborators will be invited based on contributions
- `dev` branch is considered unstable
- `master` branch is considered stable and approved. 3-rd party projects should use the `master` branch
-
-General principles to follow when writing code:
-
- Avoid adding third-party dependencies, extra files, extra headers, etc.
- Always consider cross-compatibility with other operating systems and architectures
- Avoid fancy looking modern STL constructs, use basic for loops, avoid templates, keep it simple
- There are no strict rules for the code style, but try to follow the patterns in the code (indentation, spaces, etc.). Vertical alignment makes things more readable and easier to batch edit
--- a/ggml.c
+++ b/ggml.c
@@ -1360,6 +1360,22 @@ inline static void ggml_vec_dot_q4_0(const int n, float * restrict s, const void
        const int8x16_t v1_1hs = vsubq_s8(v1_1h, s8b);

        // dot product into int16x8_t
+#if defined(__ARM_FEATURE_DOTPROD)
+        int32x4_t p_0 = vdotq_s32(vdupq_n_s32(0), v0_0ls, v1_0ls);
+        int32x4_t p_1 = vdotq_s32(vdupq_n_s32(0), v0_1ls, v1_1ls);
+
+        p_0 = vdotq_s32(p_0, v0_0hs, v1_0hs);
+        p_1 = vdotq_s32(p_1, v0_1hs, v1_1hs);
+
+        // scalar
+#if defined(__ARM_FEATURE_QRDMX)
+        sum0 += d0_0*d1_0*vaddvq_s32(p_0);
+        sum1 += d0_1*d1_1*vaddvq_s32(p_1);
+#else
+        sum0 += d0_0*d1_0*(vgetq_lane_s32(p_0, 0) + vgetq_lane_s32(p_0, 1) + vgetq_lane_s32(p_0, 2) + vgetq_lane_s32(p_0, 3));
+        sum1 += d0_1*d1_1*(vgetq_lane_s32(p_1, 0) + vgetq_lane_s32(p_1, 1) + vgetq_lane_s32(p_1, 2) + vgetq_lane_s32(p_1, 3));
+#endif
+#else
        const int16x8_t pl0l = vmull_s8(vget_low_s8 (v0_0ls), vget_low_s8 (v1_0ls));
        const int16x8_t pl0h = vmull_s8(vget_high_s8(v0_0ls), vget_high_s8(v1_0ls));

@@ -1388,6 +1404,7 @@ inline static void ggml_vec_dot_q4_0(const int n, float * restrict s, const void
 #else
        sum0 += d0_0*d1_0*(vgetq_lane_s16(p_0, 0) + vgetq_lane_s16(p_0, 1) + vgetq_lane_s16(p_0, 2) + vgetq_lane_s16(p_0, 3) + vgetq_lane_s16(p_0, 4) + vgetq_lane_s16(p_0, 5) + vgetq_lane_s16(p_0, 6) + vgetq_lane_s16(p_0, 7));
        sum1 += d0_1*d1_1*(vgetq_lane_s16(p_1, 0) + vgetq_lane_s16(p_1, 1) + vgetq_lane_s16(p_1, 2) + vgetq_lane_s16(p_1, 3) + vgetq_lane_s16(p_1, 4) + vgetq_lane_s16(p_1, 5) + vgetq_lane_s16(p_1, 6) + vgetq_lane_s16(p_1, 7));
+#endif
 #endif
    }

--- a/main.cpp
+++ b/main.cpp
@@ -11,10 +11,8 @@
 #include <string>
 #include <vector>

-#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__))
 #include <signal.h>
 #include <unistd.h>
-#endif

 #define ANSI_COLOR_RED     "\x1b[31m"
 #define ANSI_COLOR_GREEN   "\x1b[32m"
@@ -749,7 +747,6 @@ bool llama_eval(

 static bool is_interacting = false;

-#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__))
 void sigint_handler(int signo) {
    if (signo == SIGINT) {
        if (!is_interacting) {
@@ -759,7 +756,6 @@ void sigint_handler(int signo) {
        }
    }
 }
-#endif

 int main(int argc, char ** argv) {
    ggml_time_init();
@@ -826,13 +822,11 @@ int main(int argc, char ** argv) {
    }
    printf("\n");
    if (params.interactive) {
-#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__))
        struct sigaction sigint_action;
        sigint_action.sa_handler = sigint_handler;
        sigemptyset (&sigint_action.sa_mask);
        sigint_action.sa_flags = 0; 
        sigaction(SIGINT, &sigint_action, NULL);
-#endif

        printf("%s: interactive mode on.\n", __func__);

@@ -861,9 +855,7 @@ int main(int argc, char ** argv) {

    if (params.interactive) {
        printf("== Running in interactive mode. ==\n"
-#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__))
               " - Press Ctrl+C to interject at any time.\n"
-#endif
               " - Press Return to return control to LLaMa.\n"
               " - If you want to submit another line, end your input in '\\'.\n");
    }
@@ -965,15 +957,10 @@ int main(int argc, char ** argv) {
                // currently being interactive 
                bool another_line=true;
                while (another_line) {
-                    fflush(stdout);
                    char buf[256] = {0};
                    int n_read;
                    if(params.use_color) printf(ANSI_BOLD ANSI_COLOR_GREEN);
-                    if (scanf("%255[^\n]%n%*c", buf, &n_read) <= 0) {
-                        // presumable empty line, consume the newline
-                        scanf("%*c");
-                        n_read=0;
-                    }
+                    scanf("%255[^\n]%n%*c", buf, &n_read);
                    if(params.use_color) printf(ANSI_COLOR_RESET);

                    if (n_read > 0 && buf[n_read-1]=='\\') {
@@ -989,8 +976,6 @@ int main(int argc, char ** argv) {
                    std::vector<gpt_vocab::id> line_inp = ::llama_tokenize(vocab, buf, false);
                    embd_inp.insert(embd_inp.end(), line_inp.begin(), line_inp.end());

-                    remaining_tokens -= line_inp.size();
-
                    input_noecho = true; // do not echo this again
                }