diff --git a/bindings/ruby/Rakefile b/bindings/ruby/Rakefile
new file mode 100644
index 0000000..bd80295
--- /dev/null
+++ b/bindings/ruby/Rakefile
@@ -0,0 +1,128 @@
+require 'erb'
+require 'open3'
+require 'rake/clean'
+require 'rake/testtask'
+require 'rbconfig'
+require 'rubygems/package'
+require 'shellwords'
+
+# File extension used for compiled extensions on this platform.
+DLEXT = RbConfig::MAKEFILE_CONFIG['DLEXT']
+
+CLEAN.include '**/*.o'
+CLEAN.include "**/*.#{DLEXT}"
+CLOBBER.include 'doc'
+CLOBBER.include '**/*.log'
+CLOBBER.include '**/Makefile'
+CLOBBER.include '**/extconf.h'
+CLOBBER.include '**/whisper.*'
+CLOBBER.include '**/ggml.*'
+CLOBBER.include '**/dr_wav.h'
+
+BUILD_VERSION = 2
+# Determine the current version of the software
+if File.read('../../CMakeLists.txt') =~ /project.*\s*VERSION\s*(\d.+)\)/
+  CURRENT_VERSION = "#{$1}.#{BUILD_VERSION}"
+else
+  CURRENT_VERSION = "0.0.0.#{BUILD_VERSION}"
+end
+
+# Run an external command, echoing it first.
+#
+# args - Array of Strings: the program followed by its arguments.
+# opts - Hash of options:
+#        :live_stdout - IO that receives the child's stdout line by line
+#                       (the output is discarded when this is not given).
+#        :cwd         - directory to run the command in (default: Dir.pwd).
+#
+# The child's stderr is echoed with puts. Returns false when the command
+# succeeds and raises RuntimeError when it exits unsuccessfully.
+def shell(args, opts = {})
+  puts "> #{args.join(' ')}"
+  live_stream = opts[:live_stdout]
+  cwd = opts[:cwd] || Dir.pwd
+
+  Dir.chdir(cwd) do
+    wait_thr = nil
+
+    Open3.popen3(*args) do |stdin, stdout, stderr, thr|
+      stdin.close
+      wait_thr = thr # stays nil if popen3 does not yield a wait thread
+
+      # Drain stderr on its own thread: reading the two pipes one after the
+      # other can deadlock once the child fills the pipe nobody is reading.
+      err_reader = Thread.new do
+        while (line = stderr.gets)
+          puts line
+        end
+      end
+
+      while (line = stdout.gets)
+        live_stream.puts(line) if live_stream
+      end
+      err_reader.join
+    end
+
+    # Prefer the process handle from popen3; fall back to the global status.
+    p_status = wait_thr ? wait_thr.value : $?
+    unless p_status.success?
+      raise "command failed (exit status #{p_status.exitstatus.inspect}): #{args.join(' ')}"
+    end
+    false
+  end
+end
+
+make_program = (/mswin/ =~ RUBY_PLATFORM) ? 'nmake' : 'make'
+MAKECMD = ENV['MAKE_CMD'] || make_program
+MAKEOPTS = ENV['MAKE_OPTS'] || ''
+WHISPER_SO = "ext/whisper.#{DLEXT}"
+EXT_DIR = File.expand_path('ext', File.dirname(__FILE__))
+
+file 'ext/Makefile' => 'ext/extconf.rb' do
+  # Use the interpreter running rake and pass EXTCONF_OPTS as separate arguments.
+  shell([RbConfig.ruby, 'extconf.rb', *Shellwords.split(ENV['EXTCONF_OPTS'].to_s)],
+        live_stdout: STDOUT, cwd: EXT_DIR)
+end
+
+# Run make inside ext/ with MAKE_OPTS, optionally for a single target.
+def make(target = '')
+  shell([MAKECMD, *Shellwords.split(MAKEOPTS), target].reject(&:empty?),
+        live_stdout: STDOUT, cwd: EXT_DIR)
+end
+
+# Let make handle dependencies between c/o/so - we'll just run it.
+file WHISPER_SO => (['ext/Makefile'] + Dir['ext/*.cpp'] + Dir['ext/*.c'] + Dir['ext/*.h']) do
+  make
+end
+
+desc "Compile the shared object"
+task :compile => [WHISPER_SO]
+
+desc "Default Task (Test project)"
+task :default => :test
+
+Rake::TestTask.new(:test) do |t|
+  t.test_files = FileList['tests/test_*.rb']
+  t.verbose = false
+end
+
+desc 'Generate gem specification'
+task :gemspec do
+  root = File.dirname(__FILE__)
+  # rake's FileUtils helpers replace `system("cp ...")`, whose failures were ignored.
+  cp '../../LICENSE', '.'
+  cp '../../README.md', '.'
+  tspec = ERB.new(File.read(File.join(root, 'lib', 'whispercpp.gemspec.erb')))
+  File.open(File.join(root, 'whispercpp.gemspec'), 'wb') do |f|
+    f << tspec.result
+  end
+end
+
+desc 'Build gem'
+task :package => :gemspec do
+  # Load the generated spec through RubyGems instead of eval'ing its source.
+  spec = Gem::Specification.load(File.join(File.dirname(__FILE__), 'whispercpp.gemspec'))
+  raise 'whispercpp.gemspec could not be loaded' unless spec
+  spec.validate
+  Gem::Package.build(spec)
+end
diff --git a/bindings/ruby/USAGE.md b/bindings/ruby/USAGE.md
new file mode 100644
index 0000000..94107e0
--- /dev/null
+++ b/bindings/ruby/USAGE.md
@@ -0,0 +1,13 @@
+# Ruby Guide
+
+We expose Whisper::Context and Whisper::Params. The Context object can be used to transcribe.
+Parameters can be set on the Params object to customize how the transcription is generated.
+
+```ruby
+  require 'whisper'
+  whisper = Whisper::Context.new('ggml-base.en.bin')
+  params = Whisper::Params.new
+  whisper.transcribe('jfk.wav', params) {|text|
+    puts text # should include "ask not what your country can do for you, ask what you can do for your country"
+  }
+```
diff --git a/bindings/ruby/ext/ruby_whisper.cpp b/bindings/ruby/ext/ruby_whisper.cpp
index e7416ba..82cd875 100644
--- a/bindings/ruby/ext/ruby_whisper.cpp
+++ b/bindings/ruby/ext/ruby_whisper.cpp
@@ -1,4 +1,5 @@
 #include <ruby.h>
+#include <ruby/thread.h>
 #include "ruby_whisper.h"
 #define DR_WAV_IMPLEMENTATION
 #include "dr_wav.h"
@@ -94,6 +95,32 @@ static VALUE ruby_whisper_initialize(int argc, VALUE *argv, VALUE self) {
   return self;
 }
 
+struct WhisperFullParallelParams {
+  ruby_whisper *rw;
+  ruby_whisper_params *rwp;
+  std::vector<float> pcmf32; // mono-channel F32 PCM
+  std::vector<std::vector<float>> pcmf32s; // stereo-channel F32 PCM
+};
+
+
+static void stop_whisper_unblock(void *args) {
+  struct WhisperFullParallelParams *object = (struct WhisperFullParallelParams *)args;
+  fprintf(stderr, "Set running to abort\n");
+  whisper_running_abort(object->rw->context);
+}
+
+static VALUE call_whisper_full_parallel(void *args) {
+  struct WhisperFullParallelParams *object = (struct WhisperFullParallelParams *)args;
+
+  whisper_running_restore(object->rw->context);
+
+  if (whisper_full_parallel(object->rw->context, object->rwp->params, object->pcmf32.data(), object->pcmf32.size(), 1) != 0) {
+    fprintf(stderr, "failed to process audio\n");
+    return INT2FIX(-1);
+  }
+  return INT2FIX(0);
+}
+
 /*
  * transcribe a single file
  * can emit to a block results
@@ -114,8 +141,7 @@ static VALUE ruby_whisper_transcribe(int argc, VALUE *argv, VALUE self) {
 
   std::string fname_inp = StringValueCStr(wave_file_path);
 
-  std::vector<float> pcmf32; // mono-channel F32 PCM
-  std::vector<std::vector<float>> pcmf32s; // stereo-channel F32 PCM
+  struct WhisperFullParallelParams object;
 
   // WAV input - this is directly from main.cpp example
   {
@@ -173,26 +199,26 @@ static VALUE ruby_whisper_transcribe(int argc, VALUE *argv, VALUE self) {
     drwav_uninit(&wav);
 
     // convert to mono, float
-    pcmf32.resize(n);
+    object.pcmf32.resize(n);
     if (wav.channels == 1) {
       for (uint64_t i = 0; i < n; i++) {
-        pcmf32[i] = float(pcm16[i])/32768.0f;
+        object.pcmf32[i] = float(pcm16[i])/32768.0f;
       }
     } else {
       for (uint64_t i = 0; i < n; i++) {
-        pcmf32[i] = float(pcm16[2*i] + pcm16[2*i + 1])/65536.0f;
+        object.pcmf32[i] = float(pcm16[2*i] + pcm16[2*i + 1])/65536.0f;
       }
     }
 
     if (rwp->diarize) {
       // convert to stereo, float
-      pcmf32s.resize(2);
+      object.pcmf32s.resize(2);
 
-      pcmf32s[0].resize(n);
-      pcmf32s[1].resize(n);
+      object.pcmf32s[0].resize(n);
+      object.pcmf32s[1].resize(n);
       for (uint64_t i = 0; i < n; i++) {
-        pcmf32s[0][i] = float(pcm16[2*i])/32768.0f;
-        pcmf32s[1][i] = float(pcm16[2*i + 1])/32768.0f;
+        object.pcmf32s[0][i] = float(pcm16[2*i])/32768.0f;
+        object.pcmf32s[1][i] = float(pcm16[2*i + 1])/32768.0f;
       }
     }
   }
@@ -206,10 +232,16 @@ static VALUE ruby_whisper_transcribe(int argc, VALUE *argv, VALUE self) {
     rwp->params.encoder_begin_callback_user_data = &is_aborted;
   }
 
-  if (whisper_full_parallel(rw->context, rwp->params, pcmf32.data(), pcmf32.size(), 1) != 0) {
+  object.rw = rw;
+  object.rwp = rwp;
+
+  // The callback returns a Fixnum VALUE; INT2FIX(0) is not 0, so decode it before comparing.
+  const int r = FIX2INT((VALUE)rb_thread_call_without_gvl((void *(*)(void *))call_whisper_full_parallel, &object, stop_whisper_unblock, &object));
+  if (r != 0) {
     fprintf(stderr, "failed to process audio\n");
     return self;
   }
+
   const int n_segments = whisper_full_n_segments(rw->context);
   VALUE output = rb_str_new2("");
   for (int i = 0; i < n_segments; ++i) {
diff --git a/bindings/ruby/lib/whispercpp.gemspec.erb b/bindings/ruby/lib/whispercpp.gemspec.erb
new file mode 100644
index 0000000..e1e6e4b
--- /dev/null
+++ b/bindings/ruby/lib/whispercpp.gemspec.erb
@@ -0,0 +1,40 @@
+Gem::Specification.new do |s|
+  s.name = "whispercpp"
+  s.authors = ["Georgi Gerganov", "Todd A. Fisher"]
+  s.version = '<%= CURRENT_VERSION %>'
+  s.date = '<%= Time.now.strftime("%Y-%m-%d") %>'
+  s.description = %q{High-performance inference of OpenAI's Whisper automatic speech recognition (ASR) model via Ruby}
+  s.email = 'todd.fisher@gmail.com'
+  s.extra_rdoc_files = ['LICENSE', 'README.md']
+  <%
+    files = %w(LICENSE README.md Rakefile ext/extconf.rb) +
+      Dir["lib/**/**.rb"] +
+      Dir["ext/**/**.c"] +
+      Dir["ext/**/**.cpp"] +
+      Dir["ext/**/**.h"]
+
+    if ENV['BINARY_PACKAGE'] # TODO: we could build binary packages
+      files += Dir['ext/**/*.{o,so,bundle}']
+    end
+  %>
+  s.files = <%= files.inspect %>
+
+  #### Load-time details
+  s.require_paths = ['lib','ext']
+  s.summary = %q{Ruby whisper.cpp bindings}
+  s.test_files = <%= Dir['tests/**/**.rb'].inspect %>
+  <% unless ENV['BINARY_PACKAGE'] %>
+  s.extensions << 'ext/extconf.rb'
+  <% end %>
+
+  #### Documentation and testing.
+  s.homepage = 'https://github.com/ggerganov/whisper.cpp'
+  s.rdoc_options = ['--main', 'README.md']
+
+  <% if ENV['BINARY_PACKAGE'] %>
+  s.platform = Gem::Platform::CURRENT
+  <% else %>
+  s.platform = Gem::Platform::RUBY
+  <% end %>
+  s.licenses = ['MIT']
+end
diff --git a/whisper.cpp b/whisper.cpp
index 3a21581..464565e 100644
--- a/whisper.cpp
+++ b/whisper.cpp
@@ -603,6 +603,9 @@ struct whisper_context {
     // [EXPERIMENTAL] speed-up techniques
     int32_t exp_n_audio_ctx = 0; // 0 - use default
 
+    // [EXPERIMENTAL] abort handling
+    bool running = true; // NOTE(review): cleared via whisper_running_abort() while whisper_full() polls it; consider std::atomic<bool>
+
     void use_buf(struct ggml_context * ctx, int i) {
 #if defined(WHISPER_USE_SCRATCH)
         size_t last_size = 0;
@@ -3654,7 +3657,7 @@ int whisper_full(
     std::vector<beam_candidate> beam_candidates;
 
     // main loop
-    while (true) {
+    while (ctx->running) {
         const int progress_cur = (100*(seek - seek_start))/(seek_end - seek_start);
         while (progress_cur >= progress_prev + progress_step) {
             progress_prev += progress_step;
@@ -4204,12 +4207,27 @@ int whisper_full(
     return 0;
 }
 
+void whisper_running_abort(struct whisper_context * ctx) {
+    ctx->running = false;
+}
+
+void whisper_running_restore(struct whisper_context * ctx) {
+    ctx->running = true;
+}
+
+bool whisper_running_state(struct whisper_context * ctx) {
+    return ctx->running;
+}
+
 int whisper_full_parallel(
         struct whisper_context * ctx,
         struct whisper_full_params params,
         const float * samples,
         int n_samples,
         int n_processors) {
+
+    ctx->running = true;
+
     if (n_processors == 1) {
         return whisper_full(ctx, params, samples, n_samples);
     }
diff --git a/whisper.h b/whisper.h
index 3eb8d08..2044700 100644
--- a/whisper.h
+++ b/whisper.h
@@ -225,6 +225,15 @@ extern "C" {
     // Print system information
     WHISPER_API const char * whisper_print_system_info(void);
 
+    // Abort a running whisper_full_parallel or whisper_full
+    WHISPER_API void whisper_running_abort(struct whisper_context * ctx);
+
+    // Restore a whisper context from an aborted state, allowing it to run again
+    WHISPER_API void whisper_running_restore(struct whisper_context * ctx);
+
+    // Query the running state of a whisper context: true if it may run, false if it has been aborted
+    WHISPER_API bool whisper_running_state(struct whisper_context * ctx);
+
     ////////////////////////////////////////////////////////////////////////////
 
     // Available sampling strategies