Merge branch 'ggerganov:master' into master

pull/468/head
Qianhe Chen 2 years ago committed by GitHub
commit 8dc946bd81
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

@ -3,11 +3,16 @@ on:
push:
paths:
- bindings/go/**
- whisper.h
pull_request:
paths:
- bindings/go/**
- whisper.h
jobs:
ubuntu-latest:
runs-on: ubuntu-latest
steps:
ubuntu-latest:
runs-on: ubuntu-latest
steps:
- uses: actions/setup-go@v3
with:
go-version: '^1.19'

@ -1,267 +1,267 @@
name: CI
on: [push]
on: [push, pull_request]
jobs:
ubuntu-latest:
runs-on: ubuntu-latest
steps:
- name: Clone
uses: actions/checkout@v1
- name: Dependencies
run: |
sudo apt-get update
sudo apt-get install build-essential
sudo apt-get install libsdl2-dev
- name: Build
run: |
make
make stream
macOS-latest:
runs-on: macOS-latest
steps:
- name: Clone
uses: actions/checkout@v1
- name: Dependencies
run: |
brew update
brew install sdl2
- name: Build
run: |
make
make stream
ubuntu-latest-gcc:
runs-on: ubuntu-latest
strategy:
matrix:
build: [Debug, Release]
steps:
- name: Clone
uses: actions/checkout@v1
- name: Dependencies
run: |
sudo apt-get update
sudo apt-get install build-essential
sudo apt-get install cmake
sudo apt-get install libsdl2-dev
- name: Configure
run: cmake . -DWHISPER_SUPPORT_SDL2=ON -DCMAKE_BUILD_TYPE=${{ matrix.build }}
- name: Build
run: |
make
ctest -L gh --output-on-failure
ubuntu-latest-clang:
runs-on: ubuntu-latest
strategy:
matrix:
build: [Debug, Release]
steps:
- name: Clone
uses: actions/checkout@v1
- name: Dependencies
run: |
sudo apt-get update
sudo apt-get install build-essential
sudo apt-get install cmake
sudo apt-get install libsdl2-dev
- name: Configure
run: cmake . -DWHISPER_SUPPORT_SDL2=ON -DCMAKE_BUILD_TYPE=${{ matrix.build }} -DCMAKE_CXX_COMPILER=clang++ -DCMAKE_C_COMPILER=clang
- name: Build
run: |
make
ctest -L gh --output-on-failure
ubuntu-latest-gcc-sanitized:
runs-on: ubuntu-latest
strategy:
matrix:
sanitizer: [ADDRESS, THREAD, UNDEFINED]
steps:
- name: Clone
uses: actions/checkout@v1
- name: Dependencies
run: |
sudo apt-get update
sudo apt-get install build-essential
sudo apt-get install cmake
- name: Configure
run: cmake . -DCMAKE_BUILD_TYPE=Debug -DWHISPER_SANITIZE_${{ matrix.sanitizer }}=ON
- name: Build
run: |
make
ctest -L gh --output-on-failure
windows:
runs-on: windows-latest
strategy:
matrix:
build: [Release]
arch: [Win32, x64]
sdl2: [ON]
include:
- arch: Win32
s2arc: x86
- arch: x64
s2arc: x64
- sdl2: ON
s2ver: 2.26.0
steps:
- name: Clone
uses: actions/checkout@v1
- name: Add msbuild to PATH
uses: microsoft/setup-msbuild@v1
- name: Fetch SDL2 and set SDL2_DIR
if: matrix.sdl2 == 'ON'
run: |
C:/msys64/usr/bin/wget.exe -qO sdl2.zip https://github.com/libsdl-org/SDL/releases/download/release-${{ matrix.s2ver }}/SDL2-devel-${{ matrix.s2ver }}-VC.zip
7z x sdl2.zip
echo "SDL2_DIR=$env:GITHUB_WORKSPACE/SDL2-${{ matrix.s2ver }}/cmake" >> $env:GITHUB_ENV
- name: Configure
run: >
cmake -S . -B ./build -A ${{ matrix.arch }}
-DCMAKE_BUILD_TYPE=${{ matrix.build }}
-DWHISPER_SUPPORT_SDL2=${{ matrix.sdl2 }}
- name: Build
run: |
cd ./build
msbuild ALL_BUILD.vcxproj -t:build -p:configuration=${{ matrix.build }} -p:platform=${{ matrix.arch }}
- name: Copy SDL2.dll
if: matrix.sdl2 == 'ON'
run: copy "$env:SDL2_DIR/../lib/${{ matrix.s2arc }}/SDL2.dll" build/bin/${{ matrix.build }}
- name: Upload binaries
if: matrix.sdl2 == 'ON'
uses: actions/upload-artifact@v1
with:
name: whisper-bin-${{ matrix.arch }}
path: build/bin/${{ matrix.build }}
windows-blas:
runs-on: windows-latest
strategy:
matrix:
build: [Release]
arch: [Win32, x64]
blas: [ON]
sdl2: [ON]
include:
- arch: Win32
obzip: https://github.com/xianyi/OpenBLAS/releases/download/v0.3.21/OpenBLAS-0.3.21-x86.zip
s2arc: x86
- arch: x64
obzip: https://github.com/xianyi/OpenBLAS/releases/download/v0.3.21/OpenBLAS-0.3.21-x64.zip
s2arc: x64
- sdl2: ON
s2ver: 2.26.0
steps:
- name: Clone
uses: actions/checkout@v1
- name: Add msbuild to PATH
uses: microsoft/setup-msbuild@v1
- name: Fetch OpenBLAS
if: matrix.blas == 'ON'
run: |
C:/msys64/usr/bin/wget.exe -qO blas.zip ${{ matrix.obzip }}
7z x blas.zip -oblas -y
copy blas/include/cblas.h .
copy blas/include/openblas_config.h .
echo "blasdir=$env:GITHUB_WORKSPACE/blas" >> $env:GITHUB_ENV
- name: Fetch SDL2 and set SDL2_DIR
if: matrix.sdl2 == 'ON'
run: |
C:/msys64/usr/bin/wget.exe -qO sdl2.zip https://github.com/libsdl-org/SDL/releases/download/release-${{ matrix.s2ver }}/SDL2-devel-${{ matrix.s2ver }}-VC.zip
7z x sdl2.zip
echo "SDL2_DIR=$env:GITHUB_WORKSPACE/SDL2-${{ matrix.s2ver }}/cmake" >> $env:GITHUB_ENV
- name: Configure
run: >
cmake -S . -B ./build -A ${{ matrix.arch }}
-DCMAKE_BUILD_TYPE=${{ matrix.build }}
-DWHISPER_SUPPORT_OPENBLAS=${{ matrix.blas }}
-DCMAKE_LIBRARY_PATH="$env:blasdir/lib"
-DWHISPER_SUPPORT_SDL2=${{ matrix.sdl2 }}
- name: Build
run: |
cd ./build
msbuild ALL_BUILD.vcxproj -t:build -p:configuration=${{ matrix.build }} -p:platform=${{ matrix.arch }}
- name: Copy libopenblas.dll
if: matrix.blas == 'ON'
run: copy "$env:blasdir/bin/libopenblas.dll" build/bin/${{ matrix.build }}
- name: Copy SDL2.dll
if: matrix.sdl2 == 'ON'
run: copy "$env:SDL2_DIR/../lib/${{ matrix.s2arc }}/SDL2.dll" build/bin/${{ matrix.build }}
- name: Upload binaries
if: matrix.blas == 'ON' && matrix.sdl2 == 'ON'
uses: actions/upload-artifact@v1
with:
name: whisper-blas-bin-${{ matrix.arch }}
path: build/bin/${{ matrix.build }}
emscripten:
runs-on: ubuntu-latest
strategy:
matrix:
build: [Release]
steps:
- name: Clone
uses: actions/checkout@v1
- name: Dependencies
run: |
wget -q https://github.com/emscripten-core/emsdk/archive/master.tar.gz
tar -xvf master.tar.gz
emsdk-master/emsdk update
emsdk-master/emsdk install latest
emsdk-master/emsdk activate latest
- name: Configure
run: echo "tmp"
- name: Build
run: |
pushd emsdk-master
source ./emsdk_env.sh
popd
emcmake cmake . -DCMAKE_BUILD_TYPE=${{ matrix.build }}
make
ubuntu-latest:
runs-on: ubuntu-latest
steps:
- name: Clone
uses: actions/checkout@v1
- name: Dependencies
run: |
sudo apt-get update
sudo apt-get install build-essential
sudo apt-get install libsdl2-dev
- name: Build
run: |
make
make stream
macOS-latest:
runs-on: macOS-latest
steps:
- name: Clone
uses: actions/checkout@v1
- name: Dependencies
run: |
brew update
brew install sdl2
- name: Build
run: |
make
make stream
ubuntu-latest-gcc:
runs-on: ubuntu-latest
strategy:
matrix:
build: [Debug, Release]
steps:
- name: Clone
uses: actions/checkout@v1
- name: Dependencies
run: |
sudo apt-get update
sudo apt-get install build-essential
sudo apt-get install cmake
sudo apt-get install libsdl2-dev
- name: Configure
run: cmake . -DWHISPER_SUPPORT_SDL2=ON -DCMAKE_BUILD_TYPE=${{ matrix.build }}
- name: Build
run: |
make
ctest -L gh --output-on-failure
ubuntu-latest-clang:
runs-on: ubuntu-latest
strategy:
matrix:
build: [Debug, Release]
steps:
- name: Clone
uses: actions/checkout@v1
- name: Dependencies
run: |
sudo apt-get update
sudo apt-get install build-essential
sudo apt-get install cmake
sudo apt-get install libsdl2-dev
- name: Configure
run: cmake . -DWHISPER_SUPPORT_SDL2=ON -DCMAKE_BUILD_TYPE=${{ matrix.build }} -DCMAKE_CXX_COMPILER=clang++ -DCMAKE_C_COMPILER=clang
- name: Build
run: |
make
ctest -L gh --output-on-failure
ubuntu-latest-gcc-sanitized:
runs-on: ubuntu-latest
strategy:
matrix:
sanitizer: [ADDRESS, THREAD, UNDEFINED]
steps:
- name: Clone
uses: actions/checkout@v1
- name: Dependencies
run: |
sudo apt-get update
sudo apt-get install build-essential
sudo apt-get install cmake
- name: Configure
run: cmake . -DCMAKE_BUILD_TYPE=Debug -DWHISPER_SANITIZE_${{ matrix.sanitizer }}=ON
- name: Build
run: |
make
ctest -L gh --output-on-failure
windows:
runs-on: windows-latest
strategy:
matrix:
build: [Release]
arch: [Win32, x64]
sdl2: [ON]
include:
- arch: Win32
s2arc: x86
- arch: x64
s2arc: x64
- sdl2: ON
s2ver: 2.26.0
steps:
- name: Clone
uses: actions/checkout@v1
- name: Add msbuild to PATH
uses: microsoft/setup-msbuild@v1
- name: Fetch SDL2 and set SDL2_DIR
if: matrix.sdl2 == 'ON'
run: |
C:/msys64/usr/bin/wget.exe -qO sdl2.zip https://github.com/libsdl-org/SDL/releases/download/release-${{ matrix.s2ver }}/SDL2-devel-${{ matrix.s2ver }}-VC.zip
7z x sdl2.zip
echo "SDL2_DIR=$env:GITHUB_WORKSPACE/SDL2-${{ matrix.s2ver }}/cmake" >> $env:GITHUB_ENV
- name: Configure
run: >
cmake -S . -B ./build -A ${{ matrix.arch }}
-DCMAKE_BUILD_TYPE=${{ matrix.build }}
-DWHISPER_SUPPORT_SDL2=${{ matrix.sdl2 }}
- name: Build
run: |
cd ./build
msbuild ALL_BUILD.vcxproj -t:build -p:configuration=${{ matrix.build }} -p:platform=${{ matrix.arch }}
- name: Copy SDL2.dll
if: matrix.sdl2 == 'ON'
run: copy "$env:SDL2_DIR/../lib/${{ matrix.s2arc }}/SDL2.dll" build/bin/${{ matrix.build }}
- name: Upload binaries
if: matrix.sdl2 == 'ON'
uses: actions/upload-artifact@v1
with:
name: whisper-bin-${{ matrix.arch }}
path: build/bin/${{ matrix.build }}
windows-blas:
runs-on: windows-latest
strategy:
matrix:
build: [Release]
arch: [Win32, x64]
blas: [ON]
sdl2: [ON]
include:
- arch: Win32
obzip: https://github.com/xianyi/OpenBLAS/releases/download/v0.3.21/OpenBLAS-0.3.21-x86.zip
s2arc: x86
- arch: x64
obzip: https://github.com/xianyi/OpenBLAS/releases/download/v0.3.21/OpenBLAS-0.3.21-x64.zip
s2arc: x64
- sdl2: ON
s2ver: 2.26.0
steps:
- name: Clone
uses: actions/checkout@v1
- name: Add msbuild to PATH
uses: microsoft/setup-msbuild@v1
- name: Fetch OpenBLAS
if: matrix.blas == 'ON'
run: |
C:/msys64/usr/bin/wget.exe -qO blas.zip ${{ matrix.obzip }}
7z x blas.zip -oblas -y
copy blas/include/cblas.h .
copy blas/include/openblas_config.h .
echo "blasdir=$env:GITHUB_WORKSPACE/blas" >> $env:GITHUB_ENV
- name: Fetch SDL2 and set SDL2_DIR
if: matrix.sdl2 == 'ON'
run: |
C:/msys64/usr/bin/wget.exe -qO sdl2.zip https://github.com/libsdl-org/SDL/releases/download/release-${{ matrix.s2ver }}/SDL2-devel-${{ matrix.s2ver }}-VC.zip
7z x sdl2.zip
echo "SDL2_DIR=$env:GITHUB_WORKSPACE/SDL2-${{ matrix.s2ver }}/cmake" >> $env:GITHUB_ENV
- name: Configure
run: >
cmake -S . -B ./build -A ${{ matrix.arch }}
-DCMAKE_BUILD_TYPE=${{ matrix.build }}
-DWHISPER_SUPPORT_OPENBLAS=${{ matrix.blas }}
-DCMAKE_LIBRARY_PATH="$env:blasdir/lib"
-DWHISPER_SUPPORT_SDL2=${{ matrix.sdl2 }}
- name: Build
run: |
cd ./build
msbuild ALL_BUILD.vcxproj -t:build -p:configuration=${{ matrix.build }} -p:platform=${{ matrix.arch }}
- name: Copy libopenblas.dll
if: matrix.blas == 'ON'
run: copy "$env:blasdir/bin/libopenblas.dll" build/bin/${{ matrix.build }}
- name: Copy SDL2.dll
if: matrix.sdl2 == 'ON'
run: copy "$env:SDL2_DIR/../lib/${{ matrix.s2arc }}/SDL2.dll" build/bin/${{ matrix.build }}
- name: Upload binaries
if: matrix.blas == 'ON' && matrix.sdl2 == 'ON'
uses: actions/upload-artifact@v1
with:
name: whisper-blas-bin-${{ matrix.arch }}
path: build/bin/${{ matrix.build }}
emscripten:
runs-on: ubuntu-latest
strategy:
matrix:
build: [Release]
steps:
- name: Clone
uses: actions/checkout@v1
- name: Dependencies
run: |
wget -q https://github.com/emscripten-core/emsdk/archive/master.tar.gz
tar -xvf master.tar.gz
emsdk-master/emsdk update
emsdk-master/emsdk install latest
emsdk-master/emsdk activate latest
- name: Configure
run: echo "tmp"
- name: Build
run: |
pushd emsdk-master
source ./emsdk_env.sh
popd
emcmake cmake . -DCMAKE_BUILD_TYPE=${{ matrix.build }}
make

1
.gitignore vendored

@ -1,4 +1,5 @@
*.o
*.a
.cache/
.vs/
.vscode/

@ -1,6 +1,6 @@
cmake_minimum_required (VERSION 3.0)
project(whisper.cpp VERSION 1.1.1)
project(whisper.cpp VERSION 1.2.0)
# Add path to modules
list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake/")

@ -4,7 +4,7 @@
[![License: MIT](https://img.shields.io/badge/license-MIT-blue.svg)](https://opensource.org/licenses/MIT)
[![npm](https://img.shields.io/npm/v/whisper.cpp.svg)](https://www.npmjs.com/package/whisper.cpp/)
Stable: [v1.1.1](https://github.com/ggerganov/whisper.cpp/releases/tag/v1.1.1) / [Roadmap | F.A.Q.](https://github.com/ggerganov/whisper.cpp/discussions/126)
Stable: [v1.2.0](https://github.com/ggerganov/whisper.cpp/releases/tag/v1.2.0) / [Roadmap | F.A.Q.](https://github.com/ggerganov/whisper.cpp/discussions/126)
High-performance inference of [OpenAI's Whisper](https://github.com/openai/whisper) automatic speech recognition (ASR) model:
@ -13,7 +13,7 @@ High-performance inference of [OpenAI's Whisper](https://github.com/openai/whisp
- AVX intrinsics support for x86 architectures
- VSX intrinsics support for POWER architectures
- Mixed F16 / F32 precision
- Low memory usage (Flash Attention + Flash Forward)
- Low memory usage (Flash Attention)
- Zero memory allocations at runtime
- Runs on the CPU
- [C-style API](https://github.com/ggerganov/whisper.cpp/blob/master/whisper.h)
@ -89,35 +89,37 @@ c++ -I. -I./examples -O3 -std=c++11 -pthread examples/main/main.cpp whisper.o gg
usage: ./main [options] file0.wav file1.wav ...
options:
-h, --help [default] show this help message and exit
-t N, --threads N [4 ] number of threads to use during computation
-p N, --processors N [1 ] number of processors to use during computation
-ot N, --offset-t N [0 ] time offset in milliseconds
-on N, --offset-n N [0 ] segment index offset
-d N, --duration N [0 ] duration of audio to process in milliseconds
-mc N, --max-context N [-1 ] maximum number of text context tokens to store
-ml N, --max-len N [0 ] maximum segment length in characters
-bo N, --best-of N [5 ] number of best candidates to keep
-bs N, --beam-size N [-1 ] beam size for beam search
-wt N, --word-thold N [0.01 ] word timestamp probability threshold
-et N, --entropy-thold N [2.40 ] entropy threshold for decoder fail
-lpt N, --logprob-thold N [-1.00 ] log probability threshold for decoder fail
-su, --speed-up [false ] speed up audio by x2 (reduced accuracy)
-tr, --translate [false ] translate from source language to english
-di, --diarize [false ] stereo audio diarization
-otxt, --output-txt [false ] output result in a text file
-ovtt, --output-vtt [false ] output result in a vtt file
-osrt, --output-srt [false ] output result in a srt file
-owts, --output-words [false ] output script for generating karaoke video
-ocsv, --output-csv [false ] output result in a CSV file
-ps, --print-special [false ] print special tokens
-pc, --print-colors [false ] print colors
-pp, --print-progress [false ] print progress
-nt, --no-timestamps [true ] do not print timestamps
-l LANG, --language LANG [en ] spoken language ('auto' for auto-detect)
--prompt PROMPT [ ] initial prompt
-m FNAME, --model FNAME [models/ggml-base.en.bin] model path
-f FNAME, --file FNAME [ ] input WAV file path
-h, --help [default] show this help message and exit
-t N, --threads N [4 ] number of threads to use during computation
-p N, --processors N [1 ] number of processors to use during computation
-ot N, --offset-t N [0 ] time offset in milliseconds
-on N, --offset-n N [0 ] segment index offset
-d N, --duration N [0 ] duration of audio to process in milliseconds
-mc N, --max-context N [-1 ] maximum number of text context tokens to store
-ml N, --max-len N [0 ] maximum segment length in characters
-bo N, --best-of N [5 ] number of best candidates to keep
-bs N, --beam-size N [-1 ] beam size for beam search
-wt N, --word-thold N [0.01 ] word timestamp probability threshold
-et N, --entropy-thold N [2.40 ] entropy threshold for decoder fail
-lpt N, --logprob-thold N [-1.00 ] log probability threshold for decoder fail
-su, --speed-up [false ] speed up audio by x2 (reduced accuracy)
-tr, --translate [false ] translate from source language to english
-di, --diarize [false ] stereo audio diarization
-nf, --no-fallback [false ] do not use temperature fallback while decoding
-otxt, --output-txt [false ] output result in a text file
-ovtt, --output-vtt [false ] output result in a vtt file
-osrt, --output-srt [false ] output result in a srt file
-owts, --output-words [false ] output script for generating karaoke video
-ocsv, --output-csv [false ] output result in a CSV file
-of FNAME, --output-file FNAME [ ] output file path (without file extension)
-ps, --print-special [false ] print special tokens
-pc, --print-colors [false ] print colors
-pp, --print-progress [false ] print progress
-nt, --no-timestamps [true ] do not print timestamps
-l LANG, --language LANG [en ] spoken language ('auto' for auto-detect)
--prompt PROMPT [ ] initial prompt
-m FNAME, --model FNAME [models/ggml-base.en.bin] model path
-f FNAME, --file FNAME [ ] input WAV file path
bash ./models/download-ggml-model.sh base.en
@ -137,7 +139,8 @@ Running base.en on all samples in ./samples ...
[+] Running base.en on samples/jfk.wav ... (run 'ffplay samples/jfk.wav' to listen)
----------------------------------------------
whisper_model_load: loading model from 'models/ggml-base.en.bin'
whisper_init_from_file: loading model from 'models/ggml-base.en.bin'
whisper_model_load: loading model
whisper_model_load: n_vocab = 51864
whisper_model_load: n_audio_ctx = 1500
whisper_model_load: n_audio_state = 512
@ -150,13 +153,14 @@ whisper_model_load: n_text_layer = 6
whisper_model_load: n_mels = 80
whisper_model_load: f16 = 1
whisper_model_load: type = 2
whisper_model_load: mem required = 215.00 MB (+ 6.00 MB per decoder)
whisper_model_load: kv self size = 5.25 MB
whisper_model_load: kv cross size = 17.58 MB
whisper_model_load: adding 1607 extra tokens
whisper_model_load: mem_required = 506.00 MB
whisper_model_load: ggml ctx size = 140.60 MB
whisper_model_load: memory size = 22.83 MB
whisper_model_load: model ctx = 140.60 MB
whisper_model_load: model size = 140.54 MB
system_info: n_threads = 4 / 10 | AVX = 0 | AVX2 = 0 | AVX512 = 0 | NEON = 1 | FP16_VA = 1 | WASM_SIMD = 0 | BLAS = 1 |
system_info: n_threads = 4 / 10 | AVX = 0 | AVX2 = 0 | AVX512 = 0 | FMA = 0 | NEON = 1 | ARM_FMA = 1 | F16C = 0 | FP16_VA = 1 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 0 | VSX = 0 |
main: processing 'samples/jfk.wav' (176000 samples, 11.0 sec), 4 threads, 1 processors, lang = en, task = transcribe, timestamps = 1 ...
@ -164,12 +168,13 @@ main: processing 'samples/jfk.wav' (176000 samples, 11.0 sec), 4 threads, 1 proc
[00:00:00.000 --> 00:00:11.000] And so my fellow Americans, ask not what your country can do for you, ask what you can do for your country.
whisper_print_timings: load time = 105.91 ms
whisper_print_timings: mel time = 24.62 ms
whisper_print_timings: sample time = 3.63 ms
whisper_print_timings: encode time = 324.71 ms / 54.12 ms per layer
whisper_print_timings: decode time = 83.58 ms / 13.93 ms per layer
whisper_print_timings: total time = 542.81 ms
whisper_print_timings: fallbacks = 0 p / 0 h
whisper_print_timings: load time = 113.81 ms
whisper_print_timings: mel time = 15.40 ms
whisper_print_timings: sample time = 11.58 ms / 27 runs ( 0.43 ms per run)
whisper_print_timings: encode time = 266.60 ms / 1 runs ( 266.60 ms per run)
whisper_print_timings: decode time = 66.11 ms / 27 runs ( 2.45 ms per run)
whisper_print_timings: total time = 476.31 ms
```
The command downloads the `base.en` model converted to custom `ggml` format and runs the inference on all `.wav` samples in the folder `samples`.
@ -212,11 +217,11 @@ make large
| Model | Disk | Mem | SHA |
| --- | --- | --- | --- |
| tiny | 75 MB | ~390 MB | `bd577a113a864445d4c299885e0cb97d4ba92b5f` |
| base | 142 MB | ~500 MB | `465707469ff3a37a2b9b8d8f89f2f99de7299dac` |
| small | 466 MB | ~1.0 GB | `55356645c2b361a969dfd0ef2c5a50d530afd8d5` |
| medium | 1.5 GB | ~2.6 GB | `fd9727b6e1217c2f614f9b698455c4ffd82463b4` |
| large | 2.9 GB | ~4.7 GB | `0f4c8e34f21cf1a914c59d8b3ce882345ad349d6` |
| tiny | 75 MB | ~125 MB | `bd577a113a864445d4c299885e0cb97d4ba92b5f` |
| base | 142 MB | ~210 MB | `465707469ff3a37a2b9b8d8f89f2f99de7299dac` |
| small | 466 MB | ~600 MB | `55356645c2b361a969dfd0ef2c5a50d530afd8d5` |
| medium | 1.5 GB | ~1.7 GB | `fd9727b6e1217c2f614f9b698455c4ffd82463b4` |
| large | 2.9 GB | ~3.3 GB | `0f4c8e34f21cf1a914c59d8b3ce882345ad349d6` |
## Limitations
@ -234,7 +239,8 @@ in about half a minute on a MacBook M1 Pro, using `medium.en` model:
```java
$ ./main -m models/ggml-medium.en.bin -f samples/gb1.wav -t 8
whisper_model_load: loading model from 'models/ggml-medium.en.bin'
whisper_init_from_file: loading model from 'models/ggml-medium.en.bin'
whisper_model_load: loading model
whisper_model_load: n_vocab = 51864
whisper_model_load: n_audio_ctx = 1500
whisper_model_load: n_audio_state = 1024
@ -247,55 +253,60 @@ whisper_model_load: n_text_layer = 24
whisper_model_load: n_mels = 80
whisper_model_load: f16 = 1
whisper_model_load: type = 4
whisper_model_load: mem_required = 2610.00 MB
whisper_model_load: mem required = 1720.00 MB (+ 43.00 MB per decoder)
whisper_model_load: kv self size = 42.00 MB
whisper_model_load: kv cross size = 140.62 MB
whisper_model_load: adding 1607 extra tokens
whisper_model_load: ggml ctx size = 1644.97 MB
whisper_model_load: memory size = 182.62 MB
whisper_model_load: model size = 1462.12 MB
main: processing 'samples/gb1.wav' (3179750 samples, 198.7 sec), 8 threads, lang = en, task = transcribe, timestamps = 1 ...
[00:00.000 --> 00:08.000] My fellow Americans, this day has brought terrible news and great sadness to our country.
[00:08.000 --> 00:17.000] At nine o'clock this morning, Mission Control in Houston lost contact with our Space Shuttle Columbia.
[00:17.000 --> 00:23.000] A short time later, debris was seen falling from the skies above Texas.
[00:23.000 --> 00:29.000] The Columbia's lost. There are no survivors.
[00:29.000 --> 00:32.000] On board was a crew of seven.
[00:32.000 --> 00:39.000] Colonel Rick Husband, Lieutenant Colonel Michael Anderson, Commander Laurel Clark,
[00:39.000 --> 00:48.000] Captain David Brown, Commander William McCool, Dr. Kultna Shavla, and Ilan Ramon,
[00:48.000 --> 00:52.000] a colonel in the Israeli Air Force.
[00:52.000 --> 00:58.000] These men and women assumed great risk in the service to all humanity.
[00:58.000 --> 01:03.000] In an age when space flight has come to seem almost routine,
[01:03.000 --> 01:07.000] it is easy to overlook the dangers of travel by rocket
[01:07.000 --> 01:12.000] and the difficulties of navigating the fierce outer atmosphere of the Earth.
[01:12.000 --> 01:18.000] These astronauts knew the dangers, and they faced them willingly,
[01:18.000 --> 01:23.000] knowing they had a high and noble purpose in life.
[01:23.000 --> 01:31.000] Because of their courage and daring and idealism, we will miss them all the more.
[01:31.000 --> 01:36.000] All Americans today are thinking as well of the families of these men and women
[01:36.000 --> 01:40.000] who have been given this sudden shock and grief.
[01:40.000 --> 01:45.000] You're not alone. Our entire nation grieves with you,
[01:45.000 --> 01:52.000] and those you love will always have the respect and gratitude of this country.
[01:52.000 --> 01:56.000] The cause in which they died will continue.
[01:56.000 --> 02:04.000] Mankind is led into the darkness beyond our world by the inspiration of discovery
[02:04.000 --> 02:11.000] and the longing to understand. Our journey into space will go on.
[02:11.000 --> 02:16.000] In the skies today, we saw destruction and tragedy.
[02:16.000 --> 02:22.000] Yet farther than we can see, there is comfort and hope.
[02:22.000 --> 02:29.000] In the words of the prophet Isaiah, "Lift your eyes and look to the heavens
[02:29.000 --> 02:35.000] who created all these. He who brings out the starry hosts one by one
[02:35.000 --> 02:39.000] and calls them each by name."
[02:39.000 --> 02:46.000] Because of His great power and mighty strength, not one of them is missing.
[02:46.000 --> 02:55.000] The same Creator who names the stars also knows the names of the seven souls we mourn today.
[02:55.000 --> 03:01.000] The crew of the shuttle Columbia did not return safely to earth,
[03:01.000 --> 03:05.000] yet we can pray that all are safely home.
[03:05.000 --> 03:13.000] May God bless the grieving families, and may God continue to bless America.
[03:13.000 --> 03:41.000] Audio
whisper_print_timings: load time = 575.92 ms
whisper_print_timings: mel time = 230.60 ms
whisper_print_timings: sample time = 73.19 ms
whisper_print_timings: encode time = 19552.61 ms / 814.69 ms per layer
whisper_print_timings: decode time = 13249.96 ms / 552.08 ms per layer
whisper_print_timings: total time = 33686.27 ms
whisper_model_load: model ctx = 1462.35 MB
whisper_model_load: model size = 1462.12 MB
system_info: n_threads = 8 / 10 | AVX = 0 | AVX2 = 0 | AVX512 = 0 | FMA = 0 | NEON = 1 | ARM_FMA = 1 | F16C = 0 | FP16_VA = 1 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 0 | VSX = 0 |
main: processing 'samples/gb1.wav' (3179750 samples, 198.7 sec), 8 threads, 1 processors, lang = en, task = transcribe, timestamps = 1 ...
[00:00:00.000 --> 00:00:08.000] My fellow Americans, this day has brought terrible news and great sadness to our country.
[00:00:08.000 --> 00:00:17.000] At nine o'clock this morning, Mission Control in Houston lost contact with our Space Shuttle Columbia.
[00:00:17.000 --> 00:00:23.000] A short time later, debris was seen falling from the skies above Texas.
[00:00:23.000 --> 00:00:29.000] The Columbia's lost. There are no survivors.
[00:00:29.000 --> 00:00:32.000] On board was a crew of seven.
[00:00:32.000 --> 00:00:39.000] Colonel Rick Husband, Lieutenant Colonel Michael Anderson, Commander Laurel Clark,
[00:00:39.000 --> 00:00:48.000] Captain David Brown, Commander William McCool, Dr. Kultna Shavla, and Ilan Ramon,
[00:00:48.000 --> 00:00:52.000] a colonel in the Israeli Air Force.
[00:00:52.000 --> 00:00:58.000] These men and women assumed great risk in the service to all humanity.
[00:00:58.000 --> 00:01:03.000] In an age when space flight has come to seem almost routine,
[00:01:03.000 --> 00:01:07.000] it is easy to overlook the dangers of travel by rocket
[00:01:07.000 --> 00:01:12.000] and the difficulties of navigating the fierce outer atmosphere of the Earth.
[00:01:12.000 --> 00:01:18.000] These astronauts knew the dangers, and they faced them willingly,
[00:01:18.000 --> 00:01:23.000] knowing they had a high and noble purpose in life.
[00:01:23.000 --> 00:01:31.000] Because of their courage and daring and idealism, we will miss them all the more.
[00:01:31.000 --> 00:01:36.000] All Americans today are thinking as well of the families of these men and women
[00:01:36.000 --> 00:01:40.000] who have been given this sudden shock and grief.
[00:01:40.000 --> 00:01:45.000] You're not alone. Our entire nation grieves with you,
[00:01:45.000 --> 00:01:52.000] and those you love will always have the respect and gratitude of this country.
[00:01:52.000 --> 00:01:56.000] The cause in which they died will continue.
[00:01:56.000 --> 00:02:04.000] Mankind is led into the darkness beyond our world by the inspiration of discovery
[00:02:04.000 --> 00:02:11.000] and the longing to understand. Our journey into space will go on.
[00:02:11.000 --> 00:02:16.000] In the skies today, we saw destruction and tragedy.
[00:02:16.000 --> 00:02:22.000] Yet farther than we can see, there is comfort and hope.
[00:02:22.000 --> 00:02:29.000] In the words of the prophet Isaiah, "Lift your eyes and look to the heavens
[00:02:29.000 --> 00:02:35.000] who created all these. He who brings out the starry hosts one by one
[00:02:35.000 --> 00:02:39.000] and calls them each by name."
[00:02:39.000 --> 00:02:46.000] Because of His great power and mighty strength, not one of them is missing.
[00:02:46.000 --> 00:02:55.000] The same Creator who names the stars also knows the names of the seven souls we mourn today.
[00:02:55.000 --> 00:03:01.000] The crew of the shuttle Columbia did not return safely to earth,
[00:03:01.000 --> 00:03:05.000] yet we can pray that all are safely home.
[00:03:05.000 --> 00:03:13.000] May God bless the grieving families, and may God continue to bless America.
[00:03:13.000 --> 00:03:19.000] [Silence]
whisper_print_timings: fallbacks = 1 p / 0 h
whisper_print_timings: load time = 569.03 ms
whisper_print_timings: mel time = 146.85 ms
whisper_print_timings: sample time = 238.66 ms / 553 runs ( 0.43 ms per run)
whisper_print_timings: encode time = 18665.10 ms / 9 runs ( 2073.90 ms per run)
whisper_print_timings: decode time = 13090.93 ms / 549 runs ( 23.85 ms per run)
whisper_print_timings: total time = 32733.52 ms
```
</details>
@ -321,14 +332,14 @@ to highlight words with high or low confidence:
## Controlling the length of the generated text segments (experimental)
For example, to limit the line length to a maximum of 16 characters, simply add `-ml 16`:
For example, to limit the line length to a maximum of 16 characters, simply add `-ml 16`:
```java
./main -m ./models/ggml-base.en.bin -f ./samples/jfk.wav -ml 16
whisper_model_load: loading model from './models/ggml-base.en.bin'
...
system_info: n_threads = 4 / 10 | AVX2 = 0 | AVX512 = 0 | NEON = 1 | FP16_VA = 1 | WASM_SIMD = 0 | BLAS = 1 |
system_info: n_threads = 4 / 10 | AVX2 = 0 | AVX512 = 0 | NEON = 1 | FP16_VA = 1 | WASM_SIMD = 0 | BLAS = 1 |
main: processing './samples/jfk.wav' (176000 samples, 11.0 sec), 4 threads, 1 processors, lang = en, task = transcribe, timestamps = 1 ...
@ -352,7 +363,7 @@ The `--max-len` argument can be used to obtain word-level timestamps. Simply use
whisper_model_load: loading model from './models/ggml-base.en.bin'
...
system_info: n_threads = 4 / 10 | AVX2 = 0 | AVX512 = 0 | NEON = 1 | FP16_VA = 1 | WASM_SIMD = 0 | BLAS = 1 |
system_info: n_threads = 4 / 10 | AVX2 = 0 | AVX512 = 0 | NEON = 1 | FP16_VA = 1 | WASM_SIMD = 0 | BLAS = 1 |
main: processing './samples/jfk.wav' (176000 samples, 11.0 sec), 4 threads, 1 processors, lang = en, task = transcribe, timestamps = 1 ...

@ -25,6 +25,8 @@ func Process(model whisper.Model, path string, flags *Flags) error {
return err
}
fmt.Printf("\n%s\n", context.SystemInfo())
// Open the file
fmt.Fprintf(flags.Output(), "Loading %q\n", path)
fh, err := os.Open(path)
@ -64,10 +66,13 @@ func Process(model whisper.Model, path string, flags *Flags) error {
// Process the data
fmt.Fprintf(flags.Output(), " ...processing %q\n", path)
context.ResetTimings()
if err := context.Process(data, cb); err != nil {
return err
}
context.PrintTimings()
// Print out the results
switch {
case flags.GetOut() == "srt":

@ -49,6 +49,10 @@ func (p *Params) SetSpeedup(v bool) {
// Set language id
func (p *Params) SetLanguage(lang int) error {
if lang == -1 {
p.language = nil
return nil
}
str := C.whisper_lang_str(C.int(lang))
if str == nil {
return ErrInvalidLanguage
@ -66,6 +70,11 @@ func (p *Params) Language() int {
return int(C.whisper_lang_id(p.language))
}
// Threads available
func (p *Params) Threads() int {
return int(p.n_threads)
}
// Set number of threads to use
func (p *Params) SetThreads(threads int) {
p.n_threads = C.int(threads)

@ -1,7 +1,9 @@
package whisper
import (
"fmt"
"io"
"runtime"
"strings"
"time"
@ -44,7 +46,10 @@ func (context *context) SetLanguage(lang string) error {
if !context.model.IsMultilingual() {
return ErrModelNotMultilingual
}
if id := context.model.ctx.Whisper_lang_id(lang); id < 0 {
if lang == "auto" {
context.params.SetLanguage(-1)
} else if id := context.model.ctx.Whisper_lang_id(lang); id < 0 {
return ErrUnsupportedLanguage
} else if err := context.params.SetLanguage(id); err != nil {
return err
@ -59,6 +64,10 @@ func (context *context) IsMultilingual() bool {
// Get language
func (context *context) Language() string {
id := context.params.Language()
if id == -1 {
return "auto"
}
return whisper.Whisper_lang_str(context.params.Language())
}
@ -107,6 +116,36 @@ func (context *context) SetMaxTokensPerSegment(n uint) {
context.params.SetMaxTokensPerSegment(int(n))
}
// ResetTimings resets the mode timings. Should be called before processing
func (context *context) ResetTimings() {
context.model.ctx.Whisper_reset_timings()
}
// PrintTimings prints the model timings to stdout.
func (context *context) PrintTimings() {
context.model.ctx.Whisper_print_timings()
}
// SystemInfo returns the system information
func (context *context) SystemInfo() string {
return fmt.Sprintf("system_info: n_threads = %d / %d | %s\n",
context.params.Threads(),
runtime.NumCPU(),
whisper.Whisper_print_system_info(),
)
}
// Use mel data at offset_ms to try and auto-detect the spoken language
// Make sure to call whisper_pcm_to_mel() or whisper_set_mel() first.
// Returns the probabilities of all languages.
func (context *context) WhisperLangAutoDetect(offset_ms int, n_threads int) ([]float32, error) {
langProbs, err := context.model.ctx.Whisper_lang_auto_detect(offset_ms, n_threads)
if err != nil {
return nil, err
}
return langProbs, nil
}
// Process new sample data and return any errors
func (context *context) Process(data []float32, cb SegmentCallback) error {
if context.model.ctx == nil {

@ -29,7 +29,7 @@ type Model interface {
// Context is the speach recognition context.
type Context interface {
SetLanguage(string) error // Set the language to use for speech recognition.
SetLanguage(string) error // Set the language to use for speech recognition, use "auto" for auto detect language.
SetTranslate(bool) // Set translate flag
IsMultilingual() bool // Return true if the model is multilingual.
Language() string // Get language
@ -60,6 +60,12 @@ type Context interface {
IsNOT(Token) bool // Test for "No timestamps" token
IsLANG(Token, string) bool // Test for token associated with a specific language
IsText(Token) bool // Test for text token
// Timings
PrintTimings()
ResetTimings()
SystemInfo() string
}
// Segment is the text result of a speech recognition.

@ -1 +1 @@
Subproject commit 9653b42eb4d6d7ef08f736e20f05c4d24492407b
Subproject commit d5c6d5c8a39703153472055c13902defc7177d22

@ -1,6 +1,6 @@
{
"name": "whisper.cpp",
"version": "1.1.1",
"version": "1.2.0",
"description": "Whisper speech recognition",
"main": "whisper.js",
"scripts": {

File diff suppressed because one or more lines are too long

@ -8,7 +8,7 @@ function convertTypedArray(src, type) {
var printTextarea = (function() {
var element = document.getElementById('output');
if (element) element.alue = ''; // clear browser cache
if (element) element.value = ''; // clear browser cache
return function(text) {
if (arguments.length > 1) text = Array.prototype.slice.call(arguments).join(' ');
console.log(text);
@ -88,11 +88,15 @@ async function fetchRemote(url, cbProgress, cbPrint) {
// - check if the data is already in the IndexedDB
// - if not, fetch it from the remote URL and store it in the IndexedDB
function loadRemote(url, dst, size_mb, cbProgress, cbReady, cbCancel, cbPrint) {
// query the storage quota and print it
navigator.storage.estimate().then(function (estimate) {
cbPrint('loadRemote: storage quota: ' + estimate.quota + ' bytes');
cbPrint('loadRemote: storage usage: ' + estimate.usage + ' bytes');
});
if (!navigator.storage || !navigator.storage.estimate) {
cbPrint('loadRemote: navigator.storage.estimate() is not supported');
} else {
// query the storage quota and print it
navigator.storage.estimate().then(function (estimate) {
cbPrint('loadRemote: storage quota: ' + estimate.quota + ' bytes');
cbPrint('loadRemote: storage usage: ' + estimate.usage + ' bytes');
});
}
// check if the data is already in the IndexedDB
var rq = indexedDB.open(dbName, dbVersion);

@ -100,7 +100,7 @@ while [ $running -eq 1 ]; do
err=$(cat /tmp/whisper-live.err | wc -l)
done
./main -t 8 -m ./models/ggml-base.en.bin -f /tmp/whisper-live.wav --no-timestamps -otxt 2> /tmp/whispererr | tail -n 1
./main -t 8 -m ./models/ggml-${model}.bin -f /tmp/whisper-live.wav --no-timestamps -otxt 2> /tmp/whispererr | tail -n 1
while [ $SECONDS -lt $((($i+1)*$step_s)) ]; do
sleep 1

@ -9,25 +9,35 @@ It can be used as a reference for using the `whisper.cpp` library in other proje
usage: ./main [options] file0.wav file1.wav ...
options:
-h, --help [default] show this help message and exit
-t N, --threads N [4 ] number of threads to use during computation
-p N, --processors N [1 ] number of processors to use during computation
-ot N, --offset-t N [0 ] time offset in milliseconds
-on N, --offset-n N [0 ] segment index offset
-d N, --duration N [0 ] duration of audio to process in milliseconds
-mc N, --max-context N [-1 ] maximum number of text context tokens to store
-ml N, --max-len N [0 ] maximum segment length in characters
-wt N, --word-thold N [0.01 ] word timestamp probability threshold
-su, --speed-up [false ] speed up audio by x2 (reduced accuracy)
-tr, --translate [false ] translate from source language to english
-otxt, --output-txt [false ] output result in a text file
-ovtt, --output-vtt [false ] output result in a vtt file
-osrt, --output-srt [false ] output result in a srt file
-owts, --output-words [false ] output script for generating karaoke video
-ps, --print-special [false ] print special tokens
-pc, --print-colors [false ] print colors
-nt, --no-timestamps [true ] do not print timestamps
-l LANG, --language LANG [en ] spoken language
-m FNAME, --model FNAME [models/ggml-base.en.bin] model path
-f FNAME, --file FNAME [ ] input WAV file path
-h, --help [default] show this help message and exit
-t N, --threads N [4 ] number of threads to use during computation
-p N, --processors N [1 ] number of processors to use during computation
-ot N, --offset-t N [0 ] time offset in milliseconds
-on N, --offset-n N [0 ] segment index offset
-d N, --duration N [0 ] duration of audio to process in milliseconds
-mc N, --max-context N [-1 ] maximum number of text context tokens to store
-ml N, --max-len N [0 ] maximum segment length in characters
-bo N, --best-of N [5 ] number of best candidates to keep
-bs N, --beam-size N [-1 ] beam size for beam search
-wt N, --word-thold N [0.01 ] word timestamp probability threshold
-et N, --entropy-thold N [2.40 ] entropy threshold for decoder fail
-lpt N, --logprob-thold N [-1.00 ] log probability threshold for decoder fail
-su, --speed-up [false ] speed up audio by x2 (reduced accuracy)
-tr, --translate [false ] translate from source language to english
-di, --diarize [false ] stereo audio diarization
-nf, --no-fallback [false ] do not use temperature fallback while decoding
-otxt, --output-txt [false ] output result in a text file
-ovtt, --output-vtt [false ] output result in a vtt file
-osrt, --output-srt [false ] output result in a srt file
-owts, --output-words [false ] output script for generating karaoke video
-ocsv, --output-csv [false ] output result in a CSV file
-of FNAME, --output-file FNAME [ ] output file path (without file extension)
-ps, --print-special [false ] print special tokens
-pc, --print-colors [false ] print colors
-pp, --print-progress [false ] print progress
-nt, --no-timestamps [true ] do not print timestamps
-l LANG, --language LANG [en ] spoken language ('auto' for auto-detect)
--prompt PROMPT [ ] initial prompt
-m FNAME, --model FNAME [models/ggml-base.en.bin] model path
-f FNAME, --file FNAME [ ] input WAV file path
```

@ -53,22 +53,23 @@ void replace_all(std::string & s, const std::string & search, const std::string
// command-line parameters
struct whisper_params {
int32_t n_threads = std::min(4, (int32_t) std::thread::hardware_concurrency());
int32_t n_processors = 1;
int32_t offset_t_ms = 0;
int32_t offset_n = 0;
int32_t duration_ms = 0;
int32_t n_processors = 1;
int32_t offset_t_ms = 0;
int32_t offset_n = 0;
int32_t duration_ms = 0;
int32_t max_context = -1;
int32_t max_len = 0;
int32_t best_of = 5;
int32_t max_len = 0;
int32_t best_of = 5;
int32_t beam_size = -1;
float word_thold = 0.01f;
float entropy_thold = 2.4f;
float logprob_thold = -1.0f;
float word_thold = 0.01f;
float entropy_thold = 2.40f;
float logprob_thold = -1.00f;
bool speed_up = false;
bool translate = false;
bool diarize = false;
bool no_fallback = false;
bool output_txt = false;
bool output_vtt = false;
bool output_srt = false;
@ -117,6 +118,7 @@ bool whisper_params_parse(int argc, char ** argv, whisper_params & params) {
else if (arg == "-su" || arg == "--speed-up") { params.speed_up = true; }
else if (arg == "-tr" || arg == "--translate") { params.translate = true; }
else if (arg == "-di" || arg == "--diarize") { params.diarize = true; }
else if (arg == "-nf" || arg == "--no-fallback") { params.no_fallback = true; }
else if (arg == "-otxt" || arg == "--output-txt") { params.output_txt = true; }
else if (arg == "-ovtt" || arg == "--output-vtt") { params.output_vtt = true; }
else if (arg == "-osrt" || arg == "--output-srt") { params.output_srt = true; }
@ -162,6 +164,7 @@ void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & para
fprintf(stderr, " -su, --speed-up [%-7s] speed up audio by x2 (reduced accuracy)\n", params.speed_up ? "true" : "false");
fprintf(stderr, " -tr, --translate [%-7s] translate from source language to english\n", params.translate ? "true" : "false");
fprintf(stderr, " -di, --diarize [%-7s] stereo audio diarization\n", params.diarize ? "true" : "false");
fprintf(stderr, " -nf, --no-fallback [%-7s] do not use temperature fallback while decoding\n", params.no_fallback ? "true" : "false");
fprintf(stderr, " -otxt, --output-txt [%-7s] output result in a text file\n", params.output_txt ? "true" : "false");
fprintf(stderr, " -ovtt, --output-vtt [%-7s] output result in a vtt file\n", params.output_vtt ? "true" : "false");
fprintf(stderr, " -osrt, --output-srt [%-7s] output result in a srt file\n", params.output_srt ? "true" : "false");
@ -346,9 +349,6 @@ bool output_csv(struct whisper_context * ctx, const char * fname) {
const int n_segments = whisper_full_n_segments(ctx);
for (int i = 0; i < n_segments; ++i) {
const char * text = whisper_full_get_segment_text(ctx, i);
if (text[0] == ' ') {
text = text + sizeof(char); //whisper_full_get_segment_text() returns a string with leading space, point to the next character.
}
const int64_t t0 = whisper_full_get_segment_t0(ctx, i);
const int64_t t1 = whisper_full_get_segment_t1(ctx, i);
@ -517,7 +517,7 @@ int main(int argc, char ** argv) {
for (int f = 0; f < (int) params.fname_inp.size(); ++f) {
const auto fname_inp = params.fname_inp[f];
const auto fname_outp = f < params.fname_outp.size() && !params.fname_outp[f].empty() ? params.fname_outp[f] : params.fname_inp[f];
const auto fname_outp = f < (int) params.fname_outp.size() && !params.fname_outp[f].empty() ? params.fname_outp[f] : params.fname_inp[f];
std::vector<float> pcmf32; // mono-channel F32 PCM
std::vector<std::vector<float>> pcmf32s; // stereo-channel F32 PCM
@ -650,17 +650,19 @@ int main(int argc, char ** argv) {
wparams.token_timestamps = params.output_wts || params.max_len > 0;
wparams.thold_pt = params.word_thold;
wparams.entropy_thold = params.entropy_thold;
wparams.logprob_thold = params.logprob_thold;
wparams.max_len = params.output_wts && params.max_len == 0 ? 60 : params.max_len;
wparams.speed_up = params.speed_up;
wparams.prompt_tokens = prompt_tokens.empty() ? nullptr : prompt_tokens.data();
wparams.prompt_n_tokens = prompt_tokens.empty() ? 0 : prompt_tokens.size();
wparams.greedy.best_of = params.best_of;
wparams.beam_search.beam_size = params.beam_size;
wparams.prompt_tokens = prompt_tokens.empty() ? nullptr : prompt_tokens.data();
wparams.prompt_n_tokens = prompt_tokens.empty() ? 0 : prompt_tokens.size();
wparams.temperature_inc = params.no_fallback ? 0.0f : wparams.temperature_inc;
wparams.entropy_thold = params.entropy_thold;
wparams.logprob_thold = params.logprob_thold;
whisper_print_user_data user_data = { &params, &pcmf32s };

123
ggml.c

@ -1258,7 +1258,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
//
struct ggml_object {
size_t offset;
size_t offs;
size_t size;
struct ggml_object * next;
@ -1284,6 +1284,9 @@ struct ggml_context {
struct ggml_object * objects_begin;
struct ggml_object * objects_end;
struct ggml_scratch scratch;
struct ggml_scratch scratch_save;
};
struct ggml_context_container {
@ -1346,7 +1349,7 @@ inline static void ggml_critical_section_end(void) {
void ggml_print_object(const struct ggml_object * obj) {
GGML_PRINT(" - ggml_object: offset = %zu, size = %zu, next = %p\n",
obj->offset, obj->size, (const void *) obj->next);
obj->offs, obj->size, (const void *) obj->next);
}
void ggml_print_objects(const struct ggml_context * ctx) {
@ -1542,12 +1545,14 @@ struct ggml_context * ggml_init(struct ggml_init_params params) {
}
*ctx = (struct ggml_context) {
.mem_size = params.mem_size,
.mem_buffer = params.mem_buffer ? params.mem_buffer : malloc(params.mem_size),
.mem_buffer_owned = params.mem_buffer ? false : true,
.n_objects = 0,
.objects_begin = NULL,
.objects_end = NULL,
/*.mem_size =*/ params.mem_size,
/*.mem_buffer =*/ params.mem_buffer ? params.mem_buffer : malloc(params.mem_size),
/*.mem_buffer_owned =*/ params.mem_buffer ? false : true,
/*.n_objects =*/ 0,
/*.objects_begin =*/ NULL,
/*.objects_end =*/ NULL,
/*.scratch =*/ { 0, 0, NULL, },
/*.scratch_save =*/ { 0, 0, NULL, },
};
ggml_assert_aligned(ctx->mem_buffer);
@ -1570,7 +1575,7 @@ void ggml_free(struct ggml_context * ctx) {
g_state.contexts[i].used = false;
GGML_PRINT_DEBUG("%s: context %d with %d objects has been freed. memory used = %zu\n",
__func__, i, ctx->n_objects, ctx->objects_end->offset + ctx->objects_end->size);
__func__, i, ctx->n_objects, ctx->objects_end->offs + ctx->objects_end->size);
if (ctx->mem_buffer_owned) {
free(ctx->mem_buffer);
@ -1589,7 +1594,15 @@ void ggml_free(struct ggml_context * ctx) {
}
size_t ggml_used_mem(const struct ggml_context * ctx) {
return ctx->objects_end->offset + ctx->objects_end->size;
return ctx->objects_end->offs + ctx->objects_end->size;
}
size_t ggml_set_scratch(struct ggml_context * ctx, struct ggml_scratch scratch) {
const size_t result = ctx->scratch.data ? ctx->scratch.offs : 0;
ctx->scratch = scratch;
return result;
}
////////////////////////////////////////////////////////////////////////////////
@ -1603,9 +1616,9 @@ struct ggml_tensor * ggml_new_tensor_impl(
// always insert objects at the end of the context's memory pool
struct ggml_object * obj_cur = ctx->objects_end;
const size_t cur_offset = obj_cur == NULL ? 0 : obj_cur->offset;
const size_t cur_size = obj_cur == NULL ? 0 : obj_cur->size;
const size_t cur_end = cur_offset + cur_size;
const size_t cur_offs = obj_cur == NULL ? 0 : obj_cur->offs;
const size_t cur_size = obj_cur == NULL ? 0 : obj_cur->size;
const size_t cur_end = cur_offs + cur_size;
size_t size_needed = 0;
@ -1616,25 +1629,52 @@ struct ggml_tensor * ggml_new_tensor_impl(
}
// align to GGML_MEM_ALIGN
size_needed = ((size_needed + GGML_MEM_ALIGN - 1)/GGML_MEM_ALIGN)*GGML_MEM_ALIGN;
}
size_needed += sizeof(struct ggml_tensor);
if (cur_end + size_needed + GGML_OBJECT_SIZE > ctx->mem_size) {
GGML_PRINT("%s: not enough space in the context's memory pool\n", __func__);
assert(false);
return NULL;
}
char * const mem_buffer = ctx->mem_buffer;
struct ggml_object * const obj_new = (struct ggml_object *)(mem_buffer + cur_end);
*obj_new = (struct ggml_object) {
.offset = cur_end + GGML_OBJECT_SIZE,
.size = size_needed,
.next = NULL,
};
if (ctx->scratch.data == NULL || data != NULL) {
size_needed += sizeof(struct ggml_tensor);
if (cur_end + size_needed + GGML_OBJECT_SIZE > ctx->mem_size) {
GGML_PRINT("%s: not enough space in the context's memory pool (needed %zu, available %zu)\n",
__func__, cur_end + size_needed + GGML_OBJECT_SIZE, ctx->mem_size);
assert(false);
return NULL;
}
*obj_new = (struct ggml_object) {
.offs = cur_end + GGML_OBJECT_SIZE,
.size = size_needed,
.next = NULL,
};
} else {
if (ctx->scratch.offs + size_needed > ctx->scratch.size) {
GGML_PRINT("%s: not enough space in the scratch memory\n", __func__);
assert(false);
return NULL;
}
if (cur_end + sizeof(struct ggml_tensor) + GGML_OBJECT_SIZE > ctx->mem_size) {
GGML_PRINT("%s: not enough space in the context's memory pool (needed %zu, available %zu)\n",
__func__, cur_end + sizeof(struct ggml_tensor) + GGML_OBJECT_SIZE, ctx->mem_size);
assert(false);
return NULL;
}
data = (char * const) ctx->scratch.data + ctx->scratch.offs;
*obj_new = (struct ggml_object) {
.offs = cur_end + GGML_OBJECT_SIZE,
.size = sizeof(struct ggml_tensor),
.next = NULL,
};
//printf("scratch offs = %zu, size_needed = %zu\n", ctx->scratch.offs, size_needed);
ctx->scratch.offs += size_needed;
}
if (obj_cur != NULL) {
obj_cur->next = obj_new;
@ -1645,9 +1685,9 @@ struct ggml_tensor * ggml_new_tensor_impl(
ctx->objects_end = obj_new;
//GGML_PRINT_DEBUG("%s: inserted new object at %zu\n", __func__, cur_end);
//printf("%s: inserted new object at %zu, size = %zu\n", __func__, cur_end, obj_new->size);
struct ggml_tensor * const result = (struct ggml_tensor *)(mem_buffer + obj_new->offset);
struct ggml_tensor * const result = (struct ggml_tensor *)(mem_buffer + obj_new->offs);
ggml_assert_aligned(result);
@ -1690,7 +1730,7 @@ struct ggml_tensor * ggml_new_tensor(
struct ggml_context * ctx,
enum ggml_type type,
int n_dims,
const int* ne) {
const int * ne) {
return ggml_new_tensor_impl(ctx, type, n_dims, ne, NULL);
}
@ -1732,16 +1772,26 @@ struct ggml_tensor * ggml_new_tensor_4d(
}
struct ggml_tensor * ggml_new_i32(struct ggml_context * ctx, int32_t value) {
ctx->scratch_save = ctx->scratch;
ctx->scratch.data = NULL;
struct ggml_tensor * result = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 1);
ctx->scratch = ctx->scratch_save;
ggml_set_i32(result, value);
return result;
}
struct ggml_tensor * ggml_new_f32(struct ggml_context * ctx, float value) {
ctx->scratch_save = ctx->scratch;
ctx->scratch.data = NULL;
struct ggml_tensor * result = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 1);
ctx->scratch = ctx->scratch_save;
ggml_set_f32(result, value);
return result;
@ -2350,7 +2400,7 @@ struct ggml_tensor * ggml_repeat(
result->op = GGML_OP_REPEAT;
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
result->src0 = a;
result->src1 = NULL;
result->src1 = b;
return result;
}
@ -2966,9 +3016,7 @@ struct ggml_tensor * ggml_diag_mask_inf(
// TODO: when implement backward, fix this:
//struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
struct ggml_tensor * result = ggml_view_tensor(ctx, a);
struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 1);
((int32_t *) b->data)[0] = n_past;
struct ggml_tensor * b = ggml_new_i32(ctx, n_past);
result->op = GGML_OP_DIAG_MASK_INF;
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
@ -4300,7 +4348,9 @@ static bool ggml_compute_forward_mul_mat_use_blas(
const int ne1 = dst->ne[1];
// TODO: find the optimal values for these
if (ggml_is_contiguous(src0) && ggml_is_contiguous(src1) && ne0 >= 32 && ne1 >= 32 && ne10 >= 32) {
if (ggml_is_contiguous(src0) && ggml_is_contiguous(src1) && (
(ne0 >= 32 && ne1 >= 32 && ne10 >= 32)
)) {
//printf("BLAS: %d %d %d\n", ne0, ne1, ne10);
return true;
}
@ -7289,6 +7339,9 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
node->n_tasks = 1; // TODO: this actually is doing nothing
// the threads are still spinning
cur = sizeof(float)*(node->src0->ne[0]*node->src0->ne[1]);
//printf("src0: ne0 = %d, ne1 = %d, ne = %d\n", node->src0->ne[0], node->src0->ne[1], node->src0->ne[0]*node->src0->ne[1]);
//printf("src1: ne0 = %d, ne1 = %d, ne = %d\n", node->src1->ne[0], node->src1->ne[1], node->src1->ne[0]*node->src1->ne[1]);
//printf("cur = %zu\n", cur);
} else {
cur = sizeof(ggml_fp16_t)*ggml_nelements(node->src1);
}

@ -301,6 +301,13 @@ struct ggml_cgraph {
int64_t perf_time_us;
};
// scratch buffer
struct ggml_scratch {
size_t offs;
size_t size;
void * data;
};
struct ggml_init_params {
// memory pool
size_t mem_size; // bytes
@ -327,6 +334,8 @@ void ggml_free(struct ggml_context * ctx);
size_t ggml_used_mem(const struct ggml_context * ctx);
size_t ggml_set_scratch(struct ggml_context * ctx, struct ggml_scratch scratch);
struct ggml_tensor * ggml_new_tensor(
struct ggml_context * ctx,
enum ggml_type type,

File diff suppressed because it is too large Load Diff
Loading…
Cancel
Save