diff --git a/examples/whisper.android/README.md b/examples/whisper.android/README.md new file mode 100644 index 0000000..f209194 --- /dev/null +++ b/examples/whisper.android/README.md @@ -0,0 +1,10 @@ +A sample Android app using [whisper.cpp](https://github.com/ggerganov/whisper.cpp/) to do voice-to-text transcriptions. + +To use: + +1. Select a model from the [whisper.cpp repository](https://github.com/ggerganov/whisper.cpp/tree/master/models).[^1] +2. Copy the model to the "app/src/main/assets/models" folder. +3. Select a sample audio file (for example, [jfk.wav](https://github.com/ggerganov/whisper.cpp/raw/master/samples/jfk.wav)). +4. Copy the sample to the "app/src/main/assets/samples" folder. +5. Select the "release" active build variant, and use Android Studio to run and deploy to your device. +[^1]: I recommend the tiny or base models for running on an Android device. \ No newline at end of file diff --git a/examples/whisper.android/app/.gitignore b/examples/whisper.android/app/.gitignore new file mode 100644 index 0000000..42afabf --- /dev/null +++ b/examples/whisper.android/app/.gitignore @@ -0,0 +1 @@ +/build \ No newline at end of file diff --git a/examples/whisper.android/app/build.gradle b/examples/whisper.android/app/build.gradle new file mode 100644 index 0000000..5765cae --- /dev/null +++ b/examples/whisper.android/app/build.gradle @@ -0,0 +1,76 @@ +plugins { + id 'com.android.application' + id 'org.jetbrains.kotlin.android' +} + +android { + namespace 'com.whispercppdemo' + compileSdk 33 + + defaultConfig { + applicationId "com.whispercppdemo" + minSdk 26 + targetSdk 32 + versionCode 1 + versionName "1.0" + + ndk { + abiFilters 'arm64-v8a', 'x86_64' + } + + testInstrumentationRunner "androidx.test.runner.AndroidJUnitRunner" + vectorDrawables { + useSupportLibrary true + } + } + + buildTypes { + release { + signingConfig signingConfigs.debug + minifyEnabled true + proguardFiles getDefaultProguardFile('proguard-android-optimize.txt'), 'proguard-rules.pro' + } + } + compileOptions { + sourceCompatibility JavaVersion.VERSION_1_8 + targetCompatibility JavaVersion.VERSION_1_8 + } + kotlinOptions { + jvmTarget = '1.8' + } + buildFeatures { + compose true + } + composeOptions { + kotlinCompilerExtensionVersion '1.3.1' + } + ndkVersion "25.0.8528842" + externalNativeBuild { + ndkBuild { + path 'src/main/jni/whisper/Android.mk' + } + } + packagingOptions { + resources { + excludes += '/META-INF/{AL2.0,LGPL2.1}' + } + } +} + +dependencies { + implementation 'androidx.activity:activity-compose:1.6.1' + implementation 'androidx.compose.material:material-icons-core:1.3.1' + implementation 'androidx.compose.material3:material3:1.0.1' + implementation "androidx.compose.ui:ui:1.3.2" + implementation "androidx.compose.ui:ui-tooling-preview:1.3.2" + implementation 'androidx.lifecycle:lifecycle-viewmodel-compose:2.5.1' + implementation "com.google.accompanist:accompanist-permissions:0.28.0" + implementation 'org.jetbrains.kotlinx:kotlinx-coroutines-core:1.6.4' + + testImplementation 'junit:junit:4.13.2' + androidTestImplementation 'androidx.test.ext:junit:1.1.4' + androidTestImplementation 'androidx.test.espresso:espresso-core:3.5.0' + androidTestImplementation "androidx.compose.ui:ui-test-junit4:1.3.2" + debugImplementation "androidx.compose.ui:ui-tooling:1.3.2" + debugImplementation "androidx.compose.ui:ui-test-manifest:1.3.2" +} \ No newline at end of file diff --git a/examples/whisper.android/app/proguard-rules.pro b/examples/whisper.android/app/proguard-rules.pro new file mode 100644 
index 0000000..481bb43 --- /dev/null +++ b/examples/whisper.android/app/proguard-rules.pro @@ -0,0 +1,21 @@ +# Add project specific ProGuard rules here. +# You can control the set of applied configuration files using the +# proguardFiles setting in build.gradle. +# +# For more details, see +# http://developer.android.com/guide/developing/tools/proguard.html + +# If your project uses WebView with JS, uncomment the following +# and specify the fully qualified class name to the JavaScript interface +# class: +#-keepclassmembers class fqcn.of.javascript.interface.for.webview { +# public *; +#} + +# Uncomment this to preserve the line number information for +# debugging stack traces. +#-keepattributes SourceFile,LineNumberTable + +# If you keep the line number information, uncomment this to +# hide the original source file name. +#-renamesourcefileattribute SourceFile \ No newline at end of file diff --git a/examples/whisper.android/app/src/androidTest/java/com/whispercppdemo/ExampleInstrumentedTest.kt b/examples/whisper.android/app/src/androidTest/java/com/whispercppdemo/ExampleInstrumentedTest.kt new file mode 100644 index 0000000..b7117fc --- /dev/null +++ b/examples/whisper.android/app/src/androidTest/java/com/whispercppdemo/ExampleInstrumentedTest.kt @@ -0,0 +1,24 @@ +package com.whispercppdemo + +import androidx.test.platform.app.InstrumentationRegistry +import androidx.test.ext.junit.runners.AndroidJUnit4 + +import org.junit.Test +import org.junit.runner.RunWith + +import org.junit.Assert.* + +/** + * Instrumented test, which will execute on an Android device. + * + * See [testing documentation](http://d.android.com/tools/testing). + */ +@RunWith(AndroidJUnit4::class) +class ExampleInstrumentedTest { + @Test + fun useAppContext() { + // Context of the app under test. + val appContext = InstrumentationRegistry.getInstrumentation().targetContext + assertEquals("com.whispercppdemo", appContext.packageName) + } +} \ No newline at end of file diff --git a/examples/whisper.android/app/src/main/AndroidManifest.xml b/examples/whisper.android/app/src/main/AndroidManifest.xml new file mode 100644 index 0000000..cf57850 --- /dev/null +++ b/examples/whisper.android/app/src/main/AndroidManifest.xml @@ -0,0 +1,32 @@ + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/examples/whisper.android/app/src/main/java/com/whispercppdemo/MainActivity.kt b/examples/whisper.android/app/src/main/java/com/whispercppdemo/MainActivity.kt new file mode 100644 index 0000000..8bcae71 --- /dev/null +++ b/examples/whisper.android/app/src/main/java/com/whispercppdemo/MainActivity.kt @@ -0,0 +1,22 @@ +package com.whispercppdemo + +import android.os.Bundle +import androidx.activity.ComponentActivity +import androidx.activity.compose.setContent +import androidx.activity.viewModels +import com.whispercppdemo.ui.main.MainScreen +import com.whispercppdemo.ui.main.MainScreenViewModel +import com.whispercppdemo.ui.theme.WhisperCppDemoTheme + +class MainActivity : ComponentActivity() { + private val viewModel: MainScreenViewModel by viewModels { MainScreenViewModel.factory() } + + override fun onCreate(savedInstanceState: Bundle?) 
{ + super.onCreate(savedInstanceState) + setContent { + WhisperCppDemoTheme { + MainScreen(viewModel) + } + } + } +} \ No newline at end of file diff --git a/examples/whisper.android/app/src/main/java/com/whispercppdemo/media/RiffWaveHelper.kt b/examples/whisper.android/app/src/main/java/com/whispercppdemo/media/RiffWaveHelper.kt new file mode 100644 index 0000000..0947a8c --- /dev/null +++ b/examples/whisper.android/app/src/main/java/com/whispercppdemo/media/RiffWaveHelper.kt @@ -0,0 +1,76 @@ +package com.whispercppdemo.media + +import java.io.ByteArrayOutputStream +import java.io.File +import java.nio.ByteBuffer +import java.nio.ByteOrder + +fun decodeWaveFile(file: File): FloatArray { + val baos = ByteArrayOutputStream() + file.inputStream().use { it.copyTo(baos) } + val buffer = ByteBuffer.wrap(baos.toByteArray()) + buffer.order(ByteOrder.LITTLE_ENDIAN) + buffer.position(44) + val shortBuffer = buffer.asShortBuffer() + val shortArray = ShortArray(shortBuffer.limit()) + shortBuffer.get(shortArray) + return FloatArray(shortArray.size) { index -> + (shortArray[index] / 32767.0f).coerceIn(-1f..1f) + } +} + +fun encodeWaveFile(file: File, data: ShortArray) { + file.outputStream().use { + it.write(headerBytes(data.size * 2)) + val buffer = ByteBuffer.allocate(data.size * 2) + buffer.order(ByteOrder.LITTLE_ENDIAN) + buffer.asShortBuffer().put(data) + val bytes = ByteArray(buffer.limit()) + buffer.get(bytes) + it.write(bytes) + } +} + +private fun headerBytes(totalLength: Int): ByteArray { + require(totalLength >= 44) + ByteBuffer.allocate(44).apply { + order(ByteOrder.LITTLE_ENDIAN) + + put('R'.code.toByte()) + put('I'.code.toByte()) + put('F'.code.toByte()) + put('F'.code.toByte()) + + putInt(totalLength - 8) + + put('W'.code.toByte()) + put('A'.code.toByte()) + put('V'.code.toByte()) + put('E'.code.toByte()) + + put('f'.code.toByte()) + put('m'.code.toByte()) + put('t'.code.toByte()) + put(' '.code.toByte()) + + putInt(16) + putShort(1.toShort()) + putShort(1.toShort()) + putInt(16000) + putInt(32000) + putShort(2.toShort()) + putShort(16.toShort()) + + put('d'.code.toByte()) + put('a'.code.toByte()) + put('t'.code.toByte()) + put('a'.code.toByte()) + + putInt(totalLength - 44) + position(0) + }.also { + val bytes = ByteArray(it.limit()) + it.get(bytes) + return bytes + } +} \ No newline at end of file diff --git a/examples/whisper.android/app/src/main/java/com/whispercppdemo/recorder/Recorder.kt b/examples/whisper.android/app/src/main/java/com/whispercppdemo/recorder/Recorder.kt new file mode 100644 index 0000000..68df965 --- /dev/null +++ b/examples/whisper.android/app/src/main/java/com/whispercppdemo/recorder/Recorder.kt @@ -0,0 +1,88 @@ +package com.whispercppdemo.recorder + +import android.annotation.SuppressLint +import android.media.AudioFormat +import android.media.AudioRecord +import android.media.MediaRecorder +import com.whispercppdemo.media.encodeWaveFile +import kotlinx.coroutines.CoroutineScope +import kotlinx.coroutines.asCoroutineDispatcher +import kotlinx.coroutines.withContext +import java.io.File +import java.util.concurrent.Executors +import java.util.concurrent.atomic.AtomicBoolean + +class Recorder { + private val scope: CoroutineScope = CoroutineScope( + Executors.newSingleThreadExecutor().asCoroutineDispatcher() + ) + private var recorder: AudioRecordThread? 
= null + + suspend fun startRecording(outputFile: File, onError: (Exception) -> Unit) = withContext(scope.coroutineContext) { + recorder = AudioRecordThread(outputFile, onError) + recorder?.start() + } + + suspend fun stopRecording() = withContext(scope.coroutineContext) { + recorder?.stopRecording() + @Suppress("BlockingMethodInNonBlockingContext") + recorder?.join() + recorder = null + } +} + +private class AudioRecordThread( + private val outputFile: File, + private val onError: (Exception) -> Unit +) : + Thread("AudioRecorder") { + private var quit = AtomicBoolean(false) + + @SuppressLint("MissingPermission") + override fun run() { + try { + val bufferSize = AudioRecord.getMinBufferSize( + 16000, + AudioFormat.CHANNEL_IN_MONO, + AudioFormat.ENCODING_PCM_16BIT + ) * 4 + val buffer = ShortArray(bufferSize / 2) + + val audioRecord = AudioRecord( + MediaRecorder.AudioSource.MIC, + 16000, + AudioFormat.CHANNEL_IN_MONO, + AudioFormat.ENCODING_PCM_16BIT, + bufferSize + ) + + try { + audioRecord.startRecording() + + val allData = mutableListOf() + + while (!quit.get()) { + val read = audioRecord.read(buffer, 0, buffer.size) + if (read > 0) { + for (i in 0 until read) { + allData.add(buffer[i]) + } + } else { + throw java.lang.RuntimeException("audioRecord.read returned $read") + } + } + + audioRecord.stop() + encodeWaveFile(outputFile, allData.toShortArray()) + } finally { + audioRecord.release() + } + } catch (e: Exception) { + onError(e) + } + } + + fun stopRecording() { + quit.set(true) + } +} \ No newline at end of file diff --git a/examples/whisper.android/app/src/main/java/com/whispercppdemo/ui/main/MainScreen.kt b/examples/whisper.android/app/src/main/java/com/whispercppdemo/ui/main/MainScreen.kt new file mode 100644 index 0000000..69d4514 --- /dev/null +++ b/examples/whisper.android/app/src/main/java/com/whispercppdemo/ui/main/MainScreen.kt @@ -0,0 +1,97 @@ +package com.whispercppdemo.ui.main + +import androidx.compose.foundation.layout.* +import androidx.compose.material3.* +import androidx.compose.runtime.Composable +import androidx.compose.ui.Modifier +import androidx.compose.ui.res.stringResource +import androidx.compose.ui.unit.dp +import com.google.accompanist.permissions.ExperimentalPermissionsApi +import com.google.accompanist.permissions.isGranted +import com.google.accompanist.permissions.rememberPermissionState +import com.whispercppdemo.R + +@Composable +fun MainScreen(viewModel: MainScreenViewModel) { + MainScreen( + canTranscribe = viewModel.canTranscribe, + isRecording = viewModel.isRecording, + messageLog = viewModel.dataLog, + onTranscribeSampleTapped = viewModel::transcribeSample, + onRecordTapped = viewModel::toggleRecord + ) +} + +@OptIn(ExperimentalMaterial3Api::class) +@Composable +private fun MainScreen( + canTranscribe: Boolean, + isRecording: Boolean, + messageLog: String, + onTranscribeSampleTapped: () -> Unit, + onRecordTapped: () -> Unit +) { + Scaffold( + topBar = { + TopAppBar( + title = { Text(stringResource(R.string.app_name)) } + ) + }, + ) { innerPadding -> + Column( + modifier = Modifier + .padding(innerPadding) + .padding(16.dp) + ) { + Row(horizontalArrangement = Arrangement.SpaceBetween) { + TranscribeSampleButton(enabled = canTranscribe, onClick = onTranscribeSampleTapped) + RecordButton( + enabled = canTranscribe, + isRecording = isRecording, + onClick = onRecordTapped + ) + } + MessageLog(messageLog) + } + } +} + +@Composable +private fun MessageLog(log: String) { + Text(text = log) +} + +@Composable +private fun TranscribeSampleButton(enabled: 
Boolean, onClick: () -> Unit) { + Button(onClick = onClick, enabled = enabled) { + Text("Transcribe sample") + } +} + +@OptIn(ExperimentalPermissionsApi::class) +@Composable +private fun RecordButton(enabled: Boolean, isRecording: Boolean, onClick: () -> Unit) { + val micPermissionState = rememberPermissionState( + permission = android.Manifest.permission.RECORD_AUDIO, + onPermissionResult = { granted -> + if (granted) { + onClick() + } + } + ) + Button(onClick = { + if (micPermissionState.status.isGranted) { + onClick() + } else { + micPermissionState.launchPermissionRequest() + } + }, enabled = enabled) { + Text( + if (isRecording) { + "Stop recording" + } else { + "Start recording" + } + ) + } +} \ No newline at end of file diff --git a/examples/whisper.android/app/src/main/java/com/whispercppdemo/ui/main/MainScreenViewModel.kt b/examples/whisper.android/app/src/main/java/com/whispercppdemo/ui/main/MainScreenViewModel.kt new file mode 100644 index 0000000..bf04c5a --- /dev/null +++ b/examples/whisper.android/app/src/main/java/com/whispercppdemo/ui/main/MainScreenViewModel.kt @@ -0,0 +1,185 @@ +package com.whispercppdemo.ui.main + +import android.app.Application +import android.content.Context +import android.media.MediaPlayer +import android.util.Log +import androidx.compose.runtime.getValue +import androidx.compose.runtime.mutableStateOf +import androidx.compose.runtime.setValue +import androidx.core.net.toUri +import androidx.lifecycle.ViewModel +import androidx.lifecycle.ViewModelProvider +import androidx.lifecycle.viewModelScope +import androidx.lifecycle.viewmodel.initializer +import androidx.lifecycle.viewmodel.viewModelFactory +import com.whispercppdemo.media.decodeWaveFile +import com.whispercppdemo.recorder.Recorder +import com.whispercppdemo.whisper.WhisperContext +import kotlinx.coroutines.Dispatchers +import kotlinx.coroutines.launch +import kotlinx.coroutines.runBlocking +import kotlinx.coroutines.withContext +import java.io.File + +private const val LOG_TAG = "MainScreenViewModel" + +class MainScreenViewModel(private val application: Application) : ViewModel() { + var canTranscribe by mutableStateOf(false) + private set + var dataLog by mutableStateOf("") + private set + var isRecording by mutableStateOf(false) + private set + + private val modelsPath = File(application.filesDir, "models") + private val samplesPath = File(application.filesDir, "samples") + private var recorder: Recorder = Recorder() + private var whisperContext: WhisperContext? = null + private var mediaPlayer: MediaPlayer? = null + private var recordedFile: File? 
= null + + + init { + viewModelScope.launch { + loadData() + } + } + + private suspend fun loadData() { + printMessage("Loading data...\n") + try { + copyAssets() + loadBaseModel() + canTranscribe = true + } catch (e: Exception) { + Log.w(LOG_TAG, e) + printMessage("${e.localizedMessage}\n") + } + } + + private suspend fun printMessage(msg: String) = withContext(Dispatchers.Main) { + dataLog += msg + } + + private suspend fun copyAssets() = withContext(Dispatchers.IO) { + modelsPath.mkdirs() + samplesPath.mkdirs() + application.copyData("models", modelsPath, ::printMessage) + application.copyData("samples", samplesPath, ::printMessage) + printMessage("All data copied to working directory.\n") + } + + private suspend fun loadBaseModel() = withContext(Dispatchers.IO) { + printMessage("Loading model...\n") + val firstModel = modelsPath.listFiles()!!.first() + whisperContext = WhisperContext.createContext(firstModel.absolutePath) + printMessage("Loaded model ${firstModel.name}.\n") + } + + fun transcribeSample() = viewModelScope.launch { + transcribeAudio(getFirstSample()) + } + + private suspend fun getFirstSample(): File = withContext(Dispatchers.IO) { + samplesPath.listFiles()!!.first() + } + + private suspend fun readAudioSamples(file: File): FloatArray = withContext(Dispatchers.IO) { + mediaPlayer = MediaPlayer.create(application, file.absolutePath.toUri()) + mediaPlayer?.start() + return@withContext decodeWaveFile(file) + } + + private suspend fun transcribeAudio(file: File) { + if (!canTranscribe) { + return + } + + canTranscribe = false + + try { + printMessage("Reading wave samples...\n") + val data = readAudioSamples(file) + printMessage("Transcribing data...\n") + val text = whisperContext?.transcribeData(data) + printMessage("Done: $text\n") + } catch (e: Exception) { + Log.w(LOG_TAG, e) + printMessage("${e.localizedMessage}\n") + } + + canTranscribe = true + } + + fun toggleRecord() = viewModelScope.launch { + try { + if (isRecording) { + recorder.stopRecording() + isRecording = false + recordedFile?.let { transcribeAudio(it) } + } else { + val file = getTempFileForRecording() + recorder.startRecording(file) { e -> + viewModelScope.launch { + withContext(Dispatchers.Main) { + printMessage("${e.localizedMessage}\n") + isRecording = false + } + } + } + isRecording = true + recordedFile = file + } + } catch (e: Exception) { + Log.w(LOG_TAG, e) + printMessage("${e.localizedMessage}\n") + isRecording = false + } + } + + private suspend fun getTempFileForRecording() = withContext(Dispatchers.IO) { + File.createTempFile("recording", "wav") + } + + override fun onCleared() { + runBlocking { + whisperContext?.release() + whisperContext = null + } + + mediaPlayer?.stop() + mediaPlayer?.release() + mediaPlayer = null + } + + companion object { + fun factory() = viewModelFactory { + initializer { + val application = + this[ViewModelProvider.AndroidViewModelFactory.APPLICATION_KEY] as Application + MainScreenViewModel(application) + } + } + } +} + +private suspend fun Context.copyData( + assetDirName: String, + destDir: File, + printMessage: suspend (String) -> Unit +) = withContext(Dispatchers.IO) { + assets.list(assetDirName)?.forEach { name -> + val assetPath = "$assetDirName/$name" + Log.v(LOG_TAG, "Processing $assetPath...") + val destination = File(destDir, name) + Log.v(LOG_TAG, "Copying $assetPath to $destination...") + printMessage("Copying $name...\n") + assets.open(assetPath).use { input -> + destination.outputStream().use { output -> + input.copyTo(output) + } + } + Log.v(LOG_TAG, 
"Copied $assetPath to $destination") + } +} \ No newline at end of file diff --git a/examples/whisper.android/app/src/main/java/com/whispercppdemo/ui/theme/Color.kt b/examples/whisper.android/app/src/main/java/com/whispercppdemo/ui/theme/Color.kt new file mode 100644 index 0000000..31e1899 --- /dev/null +++ b/examples/whisper.android/app/src/main/java/com/whispercppdemo/ui/theme/Color.kt @@ -0,0 +1,11 @@ +package com.whispercppdemo.ui.theme + +import androidx.compose.ui.graphics.Color + +val Purple80 = Color(0xFFD0BCFF) +val PurpleGrey80 = Color(0xFFCCC2DC) +val Pink80 = Color(0xFFEFB8C8) + +val Purple40 = Color(0xFF6650a4) +val PurpleGrey40 = Color(0xFF625b71) +val Pink40 = Color(0xFF7D5260) \ No newline at end of file diff --git a/examples/whisper.android/app/src/main/java/com/whispercppdemo/ui/theme/Theme.kt b/examples/whisper.android/app/src/main/java/com/whispercppdemo/ui/theme/Theme.kt new file mode 100644 index 0000000..5f6b5bb --- /dev/null +++ b/examples/whisper.android/app/src/main/java/com/whispercppdemo/ui/theme/Theme.kt @@ -0,0 +1,68 @@ +package com.whispercppdemo.ui.theme + +import android.app.Activity +import android.os.Build +import androidx.compose.foundation.isSystemInDarkTheme +import androidx.compose.material3.MaterialTheme +import androidx.compose.material3.darkColorScheme +import androidx.compose.material3.dynamicDarkColorScheme +import androidx.compose.material3.dynamicLightColorScheme +import androidx.compose.material3.lightColorScheme +import androidx.compose.runtime.Composable +import androidx.compose.runtime.SideEffect +import androidx.compose.ui.graphics.toArgb +import androidx.compose.ui.platform.LocalContext +import androidx.compose.ui.platform.LocalView +import androidx.core.view.ViewCompat + +private val DarkColorScheme = darkColorScheme( + primary = Purple80, + secondary = PurpleGrey80, + tertiary = Pink80 +) + +private val LightColorScheme = lightColorScheme( + primary = Purple40, + secondary = PurpleGrey40, + tertiary = Pink40 + + /* Other default colors to override + background = Color(0xFFFFFBFE), + surface = Color(0xFFFFFBFE), + onPrimary = Color.White, + onSecondary = Color.White, + onTertiary = Color.White, + onBackground = Color(0xFF1C1B1F), + onSurface = Color(0xFF1C1B1F), + */ +) + +@Composable +fun WhisperCppDemoTheme( + darkTheme: Boolean = isSystemInDarkTheme(), + // Dynamic color is available on Android 12+ + dynamicColor: Boolean = true, + content: @Composable () -> Unit +) { + val colorScheme = when { + dynamicColor && Build.VERSION.SDK_INT >= Build.VERSION_CODES.S -> { + val context = LocalContext.current + if (darkTheme) dynamicDarkColorScheme(context) else dynamicLightColorScheme(context) + } + darkTheme -> DarkColorScheme + else -> LightColorScheme + } + val view = LocalView.current + if (!view.isInEditMode) { + SideEffect { + (view.context as Activity).window.statusBarColor = colorScheme.primary.toArgb() + ViewCompat.getWindowInsetsController(view)?.isAppearanceLightStatusBars = darkTheme + } + } + + MaterialTheme( + colorScheme = colorScheme, + typography = Typography, + content = content + ) +} \ No newline at end of file diff --git a/examples/whisper.android/app/src/main/java/com/whispercppdemo/ui/theme/Type.kt b/examples/whisper.android/app/src/main/java/com/whispercppdemo/ui/theme/Type.kt new file mode 100644 index 0000000..5436fe2 --- /dev/null +++ b/examples/whisper.android/app/src/main/java/com/whispercppdemo/ui/theme/Type.kt @@ -0,0 +1,34 @@ +package com.whispercppdemo.ui.theme + +import androidx.compose.material3.Typography 
+import androidx.compose.ui.text.TextStyle +import androidx.compose.ui.text.font.FontFamily +import androidx.compose.ui.text.font.FontWeight +import androidx.compose.ui.unit.sp + +// Set of Material typography styles to start with +val Typography = Typography( + bodyLarge = TextStyle( + fontFamily = FontFamily.Default, + fontWeight = FontWeight.Normal, + fontSize = 16.sp, + lineHeight = 24.sp, + letterSpacing = 0.5.sp + ) + /* Other default text styles to override + titleLarge = TextStyle( + fontFamily = FontFamily.Default, + fontWeight = FontWeight.Normal, + fontSize = 22.sp, + lineHeight = 28.sp, + letterSpacing = 0.sp + ), + labelSmall = TextStyle( + fontFamily = FontFamily.Default, + fontWeight = FontWeight.Medium, + fontSize = 11.sp, + lineHeight = 16.sp, + letterSpacing = 0.5.sp + ) + */ +) \ No newline at end of file diff --git a/examples/whisper.android/app/src/main/java/com/whispercppdemo/whisper/LibWhisper.kt b/examples/whisper.android/app/src/main/java/com/whispercppdemo/whisper/LibWhisper.kt new file mode 100644 index 0000000..69acec1 --- /dev/null +++ b/examples/whisper.android/app/src/main/java/com/whispercppdemo/whisper/LibWhisper.kt @@ -0,0 +1,61 @@ +package com.whispercppdemo.whisper + +import kotlinx.coroutines.* +import java.util.concurrent.Executors + +class WhisperContext private constructor(private var ptr: Long) { + // Meet Whisper C++ constraint: Don't access from more than one thread at a time. + private val scope: CoroutineScope = CoroutineScope( + Executors.newSingleThreadExecutor().asCoroutineDispatcher() + ) + + suspend fun transcribeData(data: FloatArray): String = withContext(scope.coroutineContext) { + require(ptr != 0L) + WhisperLib.fullTranscribe(ptr, data) + val textCount = WhisperLib.getTextSegmentCount(ptr) + return@withContext buildString { + for (i in 0 until textCount) { + append(WhisperLib.getTextSegment(ptr, i)) + } + } + } + + suspend fun release() = withContext(scope.coroutineContext) { + if (ptr != 0L) { + WhisperLib.freeContext(ptr) + ptr = 0 + } + } + + protected fun finalize() { + runBlocking { + release() + } + } + + companion object { + fun createContext(filePath: String): WhisperContext { + val ptr = WhisperLib.initContext(filePath) + if (ptr == 0L) { + throw java.lang.RuntimeException("Couldn't create context with path $filePath") + } + return WhisperContext(ptr) + } + } +} + +private class WhisperLib { + companion object { + init { + System.loadLibrary("whisper") + } + + // JNI methods + external fun initContext(modelPath: String): Long + external fun freeContext(contextPtr: Long) + external fun fullTranscribe(contextPtr: Long, audioData: FloatArray) + external fun getTextSegmentCount(contextPtr: Long): Int + external fun getTextSegment(contextPtr: Long, index: Int): String + } +} + diff --git a/examples/whisper.android/app/src/main/jni/whisper/Android.mk b/examples/whisper.android/app/src/main/jni/whisper/Android.mk new file mode 100644 index 0000000..99cd9fc --- /dev/null +++ b/examples/whisper.android/app/src/main/jni/whisper/Android.mk @@ -0,0 +1,22 @@ +LOCAL_PATH := $(call my-dir) +include $(CLEAR_VARS) +WHISPER_LIB_DIR := libwhisper +LOCAL_LDLIBS := -llog +LOCAL_MODULE := libwhisper + +# Make the final output library smaller by only keeping the symbols referenced from the app. 
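+# In release (non-debug) builds: hide symbols by default, place each function and data item
+# in its own section so the linker can drop the unused ones (--gc-sections), keep symbols from
+# statically linked libraries private (--exclude-libs,ALL), and enable link-time optimization.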
+ifneq ($(APP_OPTIM),debug) + LOCAL_CFLAGS += -fvisibility=hidden -fvisibility-inlines-hidden + LOCAL_CFLAGS += -ffunction-sections -fdata-sections + LOCAL_LDFLAGS += -Wl,--gc-sections + LOCAL_LDFLAGS += -Wl,--exclude-libs,ALL + LOCAL_LDFLAGS += -flto +endif + +LOCAL_CFLAGS += -DSTDC_HEADERS -std=c11 -I $(WHISPER_LIB_DIR) +LOCAL_CPPFLAGS += -std=c++11 +LOCAL_SRC_FILES := $(WHISPER_LIB_DIR)/ggml.c \ + $(WHISPER_LIB_DIR)/whisper.cpp \ + $(LOCAL_PATH)/jni.c + +include $(BUILD_SHARED_LIBRARY) \ No newline at end of file diff --git a/examples/whisper.android/app/src/main/jni/whisper/Application.mk b/examples/whisper.android/app/src/main/jni/whisper/Application.mk new file mode 100644 index 0000000..067c76f --- /dev/null +++ b/examples/whisper.android/app/src/main/jni/whisper/Application.mk @@ -0,0 +1 @@ +APP_STL := c++_static \ No newline at end of file diff --git a/examples/whisper.android/app/src/main/jni/whisper/jni.c b/examples/whisper.android/app/src/main/jni/whisper/jni.c new file mode 100644 index 0000000..e333487 --- /dev/null +++ b/examples/whisper.android/app/src/main/jni/whisper/jni.c @@ -0,0 +1,93 @@ +#include <jni.h> +#include <android/log.h> +#include <stdlib.h> +#include <sys/sysinfo.h> +#include "libwhisper/whisper.h" + +#define UNUSED(x) (void)(x) +#define TAG "JNI" + +#define LOGI(...) __android_log_print(ANDROID_LOG_INFO, TAG, __VA_ARGS__) + +static inline int min(int a, int b) { + return (a < b) ? a : b; +} + +static inline int max(int a, int b) { + return (a > b) ? a : b; +} + +JNIEXPORT jlong JNICALL +Java_com_whispercppdemo_whisper_WhisperLib_00024Companion_initContext( + JNIEnv *env, jobject thiz, jstring model_path_str) { + UNUSED(thiz); + struct whisper_context *context = NULL; + const char *model_path_chars = (*env)->GetStringUTFChars(env, model_path_str, NULL); + context = whisper_init(model_path_chars); + (*env)->ReleaseStringUTFChars(env, model_path_str, model_path_chars); + return (jlong) context; +} + +JNIEXPORT void JNICALL +Java_com_whispercppdemo_whisper_WhisperLib_00024Companion_freeContext( + JNIEnv *env, jobject thiz, jlong context_ptr) { + UNUSED(env); + UNUSED(thiz); + struct whisper_context *context = (struct whisper_context *) context_ptr; + whisper_free(context); +} + +JNIEXPORT void JNICALL +Java_com_whispercppdemo_whisper_WhisperLib_00024Companion_fullTranscribe( + JNIEnv *env, jobject thiz, jlong context_ptr, jfloatArray audio_data) { + UNUSED(thiz); + struct whisper_context *context = (struct whisper_context *) context_ptr; + jfloat *audio_data_arr = (*env)->GetFloatArrayElements(env, audio_data, NULL); + const jsize audio_data_length = (*env)->GetArrayLength(env, audio_data); + + // Leave 2 processors free (i.e. the high-efficiency cores).
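+ // Example (illustrative): on an 8-core big.LITTLE SoC, get_nprocs() returns 8, so
+ // max_threads = max(1, min(8, 8 - 2)) = 6; a dual-core device still gets max(1, 0) = 1 thread.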
+ int max_threads = max(1, min(8, get_nprocs() - 2)); + LOGI("Selecting %d threads", max_threads); + + // The below adapted from the Objective-C iOS sample + struct whisper_full_params params = whisper_full_default_params(WHISPER_SAMPLING_GREEDY); + params.print_realtime = true; + params.print_progress = false; + params.print_timestamps = true; + params.print_special = false; + params.translate = false; + params.language = "en"; + params.n_threads = max_threads; + params.offset_ms = 0; + params.no_context = true; + params.single_segment = false; + + whisper_reset_timings(context); + + LOGI("About to run whisper_full"); + if (whisper_full(context, params, audio_data_arr, audio_data_length) != 0) { + LOGI("Failed to run the model"); + } else { + whisper_print_timings(context); + } + (*env)->ReleaseFloatArrayElements(env, audio_data, audio_data_arr, JNI_ABORT); +} + +JNIEXPORT jint JNICALL +Java_com_whispercppdemo_whisper_WhisperLib_00024Companion_getTextSegmentCount( + JNIEnv *env, jobject thiz, jlong context_ptr) { + UNUSED(env); + UNUSED(thiz); + struct whisper_context *context = (struct whisper_context *) context_ptr; + return whisper_full_n_segments(context); +} + +JNIEXPORT jstring JNICALL +Java_com_whispercppdemo_whisper_WhisperLib_00024Companion_getTextSegment( + JNIEnv *env, jobject thiz, jlong context_ptr, jint index) { + UNUSED(thiz); + struct whisper_context *context = (struct whisper_context *) context_ptr; + const char *text = whisper_full_get_segment_text(context, index); + jstring string = (*env)->NewStringUTF(env, text); + return string; +} \ No newline at end of file diff --git a/examples/whisper.android/app/src/main/jni/whisper/libwhisper/ggml.c b/examples/whisper.android/app/src/main/jni/whisper/libwhisper/ggml.c new file mode 100644 index 0000000..e3c0db3 --- /dev/null +++ b/examples/whisper.android/app/src/main/jni/whisper/libwhisper/ggml.c @@ -0,0 +1,8354 @@ +#include "ggml.h" + +#if defined(_MSC_VER) || defined(__MINGW32__) +#include // using malloc.h with MSC/MINGW +#elif !defined(__FreeBSD__) +#include +#endif + +#include +#include +#include +#include +#include +#include +#include + +// if C99 - static_assert is nop +#ifndef static_assert +#define static_assert(cond, msg) +#endif + +#if defined _MSC_VER || defined(__MINGW32__) + +#if !defined(__MINGW32__) +#include +#else +// ref: https://github.com/ggerganov/whisper.cpp/issues/168 +#include +#include +#endif + +typedef volatile LONG atomic_int; +typedef atomic_int atomic_bool; + +static void atomic_store(atomic_int* ptr, LONG val) { + InterlockedExchange(ptr, val); +} +static LONG atomic_load(atomic_int* ptr) { + return InterlockedCompareExchange(ptr, 0, 0); +} +static LONG atomic_fetch_add(atomic_int* ptr, LONG inc) { + return InterlockedExchangeAdd(ptr, inc); +} +static LONG atomic_fetch_sub(atomic_int* ptr, LONG dec) { + return atomic_fetch_add(ptr, -(dec)); +} + +typedef HANDLE pthread_t; + +typedef DWORD thread_ret_t; +static int pthread_create(pthread_t* out, void* unused, thread_ret_t(*func)(void*), void* arg) { + HANDLE handle = CreateThread(NULL, 0, (LPTHREAD_START_ROUTINE) func, arg, 0, NULL); + if (handle == NULL) + { + return EAGAIN; + } + + *out = handle; + return 0; +} + +static int pthread_join(pthread_t thread, void* unused) { + return (int) WaitForSingleObject(thread, INFINITE); +} + +static int sched_yield (void) { + Sleep (0); + return 0; +} +#else +#include +#include + +typedef void* thread_ret_t; +#endif + +#ifdef __HAIKU__ +#define static_assert(cond, msg) _Static_assert(cond, msg) +#endif + 
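+// Compile-time configuration for this file: GGML_DEBUG is a debug-log verbosity threshold
+// (0 disables the GGML_PRINT_DEBUG* macros), GGML_GELU_FP16 switches ggml_vec_gelu_f32 to the
+// precomputed fp16 GELU lookup table, and GGML_MEM_ALIGN below is 4 bytes on 32-bit targets
+// (UINTPTR_MAX == 0xFFFFFFFF) and 16 bytes otherwise.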
+#define GGML_DEBUG 0 +#define GGML_GELU_FP16 + +#if UINTPTR_MAX == 0xFFFFFFFF + #define GGML_MEM_ALIGN 4 +#else + #define GGML_MEM_ALIGN 16 +#endif + +#define MAX(a, b) ((a) > (b) ? (a) : (b)) +#define MIN(a, b) ((a) < (b) ? (a) : (b)) + +#define UNUSED(x) (void)(x) +#define SWAP(x, y, T) do { T SWAP = x; x = y; y = SWAP; } while (0) + +#define GGML_ASSERT(x) \ + do { \ + if (!(x)) { \ + fprintf(stderr, "GGML_ASSERT: %s:%d: %s\n", __FILE__, __LINE__, #x); \ + abort(); \ + } \ + } while (0) + +#ifdef GGML_USE_ACCELERATE +#include +#elif GGML_USE_OPENBLAS +#include +#endif + +// floating point type used to accumulate sums +typedef double ggml_float; + +// 16-bit float +// on Arm, we use __fp16 +// on x86, we use uint16_t +#ifdef __ARM_NEON + +// if YCM cannot find , make a symbolic link to it, for example: +// +// $ ln -sfn /Library/Developer/CommandLineTools/usr/lib/clang/13.1.6/include/arm_neon.h ./src/ +// +#include + +float ggml_fp16_to_fp32(ggml_fp16_t x) { + return x; +} + +ggml_fp16_t ggml_fp32_to_fp16(float x) { + return x; +} + +#define GGML_FP16_TO_FP32(x) (x) +#define GGML_FP32_TO_FP16(x) (x) + +#else + +#ifdef __wasm_simd128__ +#include +#else +#include +#endif + +#ifdef __F16C__ +float ggml_fp16_to_fp32(ggml_fp16_t h) { + return _cvtsh_ss(h); +} +ggml_fp16_t ggml_fp32_to_fp16(float f) { + return _cvtss_sh(f, 0); +} + +#define GGML_FP16_TO_FP32(x) _cvtsh_ss(x) +#define GGML_FP32_TO_FP16(x) _cvtss_sh(x, 0) + +#else + +// FP16 <-> FP32 +// ref: https://github.com/Maratyszcza/FP16 + +static inline float fp32_from_bits(uint32_t w) { + union { + uint32_t as_bits; + float as_value; + } fp32; + fp32.as_bits = w; + return fp32.as_value; +} + +static inline uint32_t fp32_to_bits(float f) { + union { + float as_value; + uint32_t as_bits; + } fp32; + fp32.as_value = f; + return fp32.as_bits; +} + +float ggml_fp16_to_fp32(ggml_fp16_t h) { + const uint32_t w = (uint32_t) h << 16; + const uint32_t sign = w & UINT32_C(0x80000000); + const uint32_t two_w = w + w; + + const uint32_t exp_offset = UINT32_C(0xE0) << 23; +#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) || defined(__GNUC__) && !defined(__STRICT_ANSI__) + const float exp_scale = 0x1.0p-112f; +#else + const float exp_scale = fp32_from_bits(UINT32_C(0x7800000)); +#endif + const float normalized_value = fp32_from_bits((two_w >> 4) + exp_offset) * exp_scale; + + const uint32_t magic_mask = UINT32_C(126) << 23; + const float magic_bias = 0.5f; + const float denormalized_value = fp32_from_bits((two_w >> 17) | magic_mask) - magic_bias; + + const uint32_t denormalized_cutoff = UINT32_C(1) << 27; + const uint32_t result = sign | + (two_w < denormalized_cutoff ? 
fp32_to_bits(denormalized_value) : fp32_to_bits(normalized_value)); + return fp32_from_bits(result); +} + +ggml_fp16_t ggml_fp32_to_fp16(float f) { +#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) || defined(__GNUC__) && !defined(__STRICT_ANSI__) + const float scale_to_inf = 0x1.0p+112f; + const float scale_to_zero = 0x1.0p-110f; +#else + const float scale_to_inf = fp32_from_bits(UINT32_C(0x77800000)); + const float scale_to_zero = fp32_from_bits(UINT32_C(0x08800000)); +#endif + float base = (fabsf(f) * scale_to_inf) * scale_to_zero; + + const uint32_t w = fp32_to_bits(f); + const uint32_t shl1_w = w + w; + const uint32_t sign = w & UINT32_C(0x80000000); + uint32_t bias = shl1_w & UINT32_C(0xFF000000); + if (bias < UINT32_C(0x71000000)) { + bias = UINT32_C(0x71000000); + } + + base = fp32_from_bits((bias >> 1) + UINT32_C(0x07800000)) + base; + const uint32_t bits = fp32_to_bits(base); + const uint32_t exp_bits = (bits >> 13) & UINT32_C(0x00007C00); + const uint32_t mantissa_bits = bits & UINT32_C(0x00000FFF); + const uint32_t nonsign = exp_bits + mantissa_bits; + return (sign >> 16) | (shl1_w > UINT32_C(0xFF000000) ? UINT16_C(0x7E00) : nonsign); +} + +#define GGML_FP16_TO_FP32(x) ggml_fp16_to_fp32(x) +#define GGML_FP32_TO_FP16(x) ggml_fp32_to_fp16(x) + +#endif // __F16C__ + +#endif // __ARM_NEON + +// +// global data +// + +// precomputed gelu table for f16 (128 KB) +static ggml_fp16_t table_gelu_f16[1 << 16]; + +// precomputed exp table for f16 (128 KB) +static ggml_fp16_t table_exp_f16[1 << 16]; + +// +// timing +// + +#if defined(_MSC_VER) || defined(__MINGW32__) +static int64_t timer_freq; +void ggml_time_init(void) { + LARGE_INTEGER frequency; + QueryPerformanceFrequency(&frequency); + timer_freq = frequency.QuadPart; +} +int64_t ggml_time_ms(void) { + LARGE_INTEGER t; + QueryPerformanceCounter(&t); + return (t.QuadPart * 1000) / timer_freq; +} +int64_t ggml_time_us(void) { + LARGE_INTEGER t; + QueryPerformanceCounter(&t); + return (t.QuadPart * 1000000) / timer_freq; +} +#else +void ggml_time_init(void) {} +int64_t ggml_time_ms(void) { + struct timespec ts; + clock_gettime(CLOCK_MONOTONIC, &ts); + return (int64_t)ts.tv_sec*1000 + (int64_t)ts.tv_nsec/1000000; +} + +int64_t ggml_time_us(void) { + struct timespec ts; + clock_gettime(CLOCK_MONOTONIC, &ts); + return (int64_t)ts.tv_sec*1000000 + (int64_t)ts.tv_nsec/1000; +} +#endif + +int64_t ggml_cycles(void) { + return clock(); +} + +int64_t ggml_cycles_per_ms(void) { + return CLOCKS_PER_SEC/1000; +} + +#ifdef GGML_PERF +#define ggml_perf_time_ms() ggml_time_ms() +#define ggml_perf_time_us() ggml_time_us() +#define ggml_perf_cycles() ggml_cycles() +#define ggml_perf_cycles_per_ms() ggml_cycles_per_ms() +#else +#define ggml_perf_time_ms() 0 +#define ggml_perf_time_us() 0 +#define ggml_perf_cycles() 0 +#define ggml_perf_cycles_per_ms() 0 +#endif + +// +// cache line +// + +#if defined(__cpp_lib_hardware_interference_size) +#define CACHE_LINE_SIZE hardware_destructive_interference_size +#else +#define CACHE_LINE_SIZE 64 +#endif + +const size_t CACHE_LINE_SIZE_F32 = CACHE_LINE_SIZE/sizeof(float); + +// +// fundamental operations +// + +inline static void ggml_vec_set_i8(const int n, int8_t * x, const int8_t v) { for (int i = 0; i < n; ++i) x[i] = v; } + +inline static void ggml_vec_set_i16(const int n, int16_t * x, const int16_t v) { for (int i = 0; i < n; ++i) x[i] = v; } + +inline static void ggml_vec_set_i32(const int n, int32_t * x, const int32_t v) { for (int i = 0; i < n; ++i) x[i] = v; } + +inline static void 
ggml_vec_set_f16(const int n, ggml_fp16_t * x, const int32_t v) { for (int i = 0; i < n; ++i) x[i] = v; } + +inline static void ggml_vec_add_f32 (const int n, float * z, const float * x, const float * y) { for (int i = 0; i < n; ++i) z[i] = x[i] + y[i]; } +inline static void ggml_vec_acc_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] += x[i]; } +inline static void ggml_vec_acc1_f32(const int n, float * y, const float v) { for (int i = 0; i < n; ++i) y[i] += v; } +inline static void ggml_vec_sub_f32 (const int n, float * z, const float * x, const float * y) { for (int i = 0; i < n; ++i) z[i] = x[i] - y[i]; } +inline static void ggml_vec_set_f32 (const int n, float * x, const float v) { for (int i = 0; i < n; ++i) x[i] = v; } +inline static void ggml_vec_cpy_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = x[i]; } +inline static void ggml_vec_neg_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = -x[i]; } +inline static void ggml_vec_mul_f32 (const int n, float * z, const float * x, const float * y) { for (int i = 0; i < n; ++i) z[i] = x[i]*y[i]; } +inline static void ggml_vec_div_f32 (const int n, float * z, const float * x, const float * y) { for (int i = 0; i < n; ++i) z[i] = x[i]/y[i]; } + +inline static void ggml_vec_dot_f32(const int n, float * restrict s, const float * restrict x, const float * restrict y) { + ggml_float sumf = 0.0; +#ifdef __ARM_NEON + // NEON 128-bit + const int n16 = (n & ~15); + + float32x4_t sum0 = vdupq_n_f32(0); + float32x4_t sum1 = vdupq_n_f32(0); + float32x4_t sum2 = vdupq_n_f32(0); + float32x4_t sum3 = vdupq_n_f32(0); + + float32x4_t x0, x1, x2, x3; + float32x4_t y0, y1, y2, y3; + + for (int i = 0; i < n16; i += 16) { + x0 = vld1q_f32(x + i + 0); + x1 = vld1q_f32(x + i + 4); + x2 = vld1q_f32(x + i + 8); + x3 = vld1q_f32(x + i + 12); + + y0 = vld1q_f32(y + i + 0); + y1 = vld1q_f32(y + i + 4); + y2 = vld1q_f32(y + i + 8); + y3 = vld1q_f32(y + i + 12); + + sum0 = vfmaq_f32(sum0, x0, y0); + sum1 = vfmaq_f32(sum1, x1, y1); + sum2 = vfmaq_f32(sum2, x2, y2); + sum3 = vfmaq_f32(sum3, x3, y3); + } + + // reduce sum0..sum3 to sum0 + sum0 = vaddq_f32(sum0, sum1); + sum2 = vaddq_f32(sum2, sum3); + sum0 = vaddq_f32(sum0, sum2); + + float32x2_t sumf32 = vadd_f32(vget_low_f32(sum0), vget_high_f32(sum0)); + sumf = vget_lane_f32(sumf32, 0) + vget_lane_f32(sumf32, 1); + + // leftovers + for (int i = n16; i < n; ++i) { + sumf += x[i]*y[i]; + } +#elif defined(__AVX2__) + // AVX 256-bit + const int n32 = (n & ~31); + + __m256 sum0 = _mm256_setzero_ps(); + __m256 sum1 = _mm256_setzero_ps(); + __m256 sum2 = _mm256_setzero_ps(); + __m256 sum3 = _mm256_setzero_ps(); + + __m256 x0, x1, x2, x3; + __m256 y0, y1, y2, y3; + + for (int i = 0; i < n32; i += 32) { + x0 = _mm256_loadu_ps(x + i + 0); + x1 = _mm256_loadu_ps(x + i + 8); + x2 = _mm256_loadu_ps(x + i + 16); + x3 = _mm256_loadu_ps(x + i + 24); + + y0 = _mm256_loadu_ps(y + i + 0); + y1 = _mm256_loadu_ps(y + i + 8); + y2 = _mm256_loadu_ps(y + i + 16); + y3 = _mm256_loadu_ps(y + i + 24); + + sum0 = _mm256_fmadd_ps(x0, y0, sum0); + sum1 = _mm256_fmadd_ps(x1, y1, sum1); + sum2 = _mm256_fmadd_ps(x2, y2, sum2); + sum3 = _mm256_fmadd_ps(x3, y3, sum3); + } + + sum0 = _mm256_add_ps(sum0, sum1); + sum2 = _mm256_add_ps(sum2, sum3); + sum0 = _mm256_add_ps(sum0, sum2); + + const __m128 r4 = _mm_add_ps(_mm256_castps256_ps128(sum0), _mm256_extractf128_ps(sum0, 1)); + const __m128 r2 = _mm_add_ps(r4, _mm_movehl_ps(r4, r4)); + const __m128 r1 = 
_mm_add_ss(r2, _mm_movehdup_ps(r2)); + + sumf = _mm_cvtss_f32(r1); + + // leftovers + for (int i = n32; i < n; ++i) { + sumf += x[i]*y[i]; + } +#elif defined(__AVX__) + // AVX 256-bit + const int n32 = (n & ~31); + + __m256 sum0 = _mm256_setzero_ps(); + __m256 sum1 = _mm256_setzero_ps(); + __m256 sum2 = _mm256_setzero_ps(); + __m256 sum3 = _mm256_setzero_ps(); + + __m256 x0, x1, x2, x3; + __m256 y0, y1, y2, y3; + + for (int i = 0; i < n32; i += 32) { + x0 = _mm256_loadu_ps(x + i + 0); + x1 = _mm256_loadu_ps(x + i + 8); + x2 = _mm256_loadu_ps(x + i + 16); + x3 = _mm256_loadu_ps(x + i + 24); + + y0 = _mm256_loadu_ps(y + i + 0); + y1 = _mm256_loadu_ps(y + i + 8); + y2 = _mm256_loadu_ps(y + i + 16); + y3 = _mm256_loadu_ps(y + i + 24); + + sum0 = _mm256_add_ps(_mm256_mul_ps(x0, y0), sum0); + sum1 = _mm256_add_ps(_mm256_mul_ps(x1, y1), sum1); + sum2 = _mm256_add_ps(_mm256_mul_ps(x2, y2), sum2); + sum3 = _mm256_add_ps(_mm256_mul_ps(x3, y3), sum3); + } + + sum0 = _mm256_add_ps(sum0, sum1); + sum2 = _mm256_add_ps(sum2, sum3); + sum0 = _mm256_add_ps(sum0, sum2); + + const __m128 r4 = _mm_add_ps(_mm256_castps256_ps128(sum0), _mm256_extractf128_ps(sum0, 1)); + const __m128 r2 = _mm_add_ps(r4, _mm_movehl_ps(r4, r4)); + const __m128 r1 = _mm_add_ss(r2, _mm_movehdup_ps(r2)); + + sumf = _mm_cvtss_f32(r1); + + // leftovers + for (int i = n32; i < n; ++i) { + sumf += x[i]*y[i]; + } +#elif defined(__wasm_simd128__) + // WASM 128-bit + const int n16 = (n & ~15); + + v128_t sum0 = wasm_f32x4_splat(0); + v128_t sum1 = wasm_f32x4_splat(0); + v128_t sum2 = wasm_f32x4_splat(0); + v128_t sum3 = wasm_f32x4_splat(0); + + v128_t x0, x1, x2, x3; + v128_t y0, y1, y2, y3; + + for (int i = 0; i < n16; i += 16) { + x0 = wasm_v128_load(x + i + 0); + x1 = wasm_v128_load(x + i + 4); + x2 = wasm_v128_load(x + i + 8); + x3 = wasm_v128_load(x + i + 12); + + y0 = wasm_v128_load(y + i + 0); + y1 = wasm_v128_load(y + i + 4); + y2 = wasm_v128_load(y + i + 8); + y3 = wasm_v128_load(y + i + 12); + + sum0 = wasm_f32x4_add(sum0, wasm_f32x4_mul(x0, y0)); + sum1 = wasm_f32x4_add(sum1, wasm_f32x4_mul(x1, y1)); + sum2 = wasm_f32x4_add(sum2, wasm_f32x4_mul(x2, y2)); + sum3 = wasm_f32x4_add(sum3, wasm_f32x4_mul(x3, y3)); + } + + sum0 = wasm_f32x4_add(sum0, sum1); + sum2 = wasm_f32x4_add(sum2, sum3); + sum0 = wasm_f32x4_add(sum0, sum2); + + sumf = wasm_f32x4_extract_lane(sum0, 0) + wasm_f32x4_extract_lane(sum0, 1) + wasm_f32x4_extract_lane(sum0, 2) + wasm_f32x4_extract_lane(sum0, 3); + + // leftovers + for (int i = n16; i < n; ++i) { + sumf += x[i]*y[i]; + } +#else + // scalar + for (int i = 0; i < n; ++i) { + sumf += x[i]*y[i]; + } +#endif + + *s = sumf; +} + +inline static void ggml_vec_dot_f16(const int n, float * restrict s, ggml_fp16_t * restrict x, ggml_fp16_t * restrict y) { + ggml_float sumf = 0.0; +#ifdef __ARM_NEON + const int n32 = (n & ~31); + +#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) + float16x8_t sum0 = vdupq_n_f16(0); + float16x8_t sum1 = vdupq_n_f16(0); + float16x8_t sum2 = vdupq_n_f16(0); + float16x8_t sum3 = vdupq_n_f16(0); + + float16x8_t x0, x1, x2, x3; + float16x8_t y0, y1, y2, y3; + + for (int i = 0; i < n32; i += 32) { + x0 = vld1q_f16(x + i + 0 ); + x1 = vld1q_f16(x + i + 8 ); + x2 = vld1q_f16(x + i + 16); + x3 = vld1q_f16(x + i + 24); + + y0 = vld1q_f16(y + i + 0 ); + y1 = vld1q_f16(y + i + 8 ); + y2 = vld1q_f16(y + i + 16); + y3 = vld1q_f16(y + i + 24); + + sum0 = vfmaq_f16(sum0, x0, y0); + sum1 = vfmaq_f16(sum1, x1, y1); + sum2 = vfmaq_f16(sum2, x2, y2); + sum3 = vfmaq_f16(sum3, x3, y3); + } + + // reduce 
sum0..sum3 to sum0 + sum0 = vaddq_f16(sum0, sum1); + sum2 = vaddq_f16(sum2, sum3); + sum0 = vaddq_f16(sum0, sum2); + + // load sum0 into 2 float32x4_t + float32x4_t sum0f32 = vcvt_f32_f16(vget_low_f16(sum0)); + float32x4_t sum1f32 = vcvt_f32_f16(vget_high_f16(sum0)); + + // reduce sum0f32 and sum1f32 to sumf + sum0f32 = vaddq_f32(sum0f32, sum1f32); + + float32x2_t sumf32 = vadd_f32(vget_low_f32(sum0f32), vget_high_f32(sum0f32)); + sumf = vget_lane_f32(sumf32, 0) + vget_lane_f32(sumf32, 1); +#else + float32x4_t sum0 = vdupq_n_f32(0); + float32x4_t sum1 = vdupq_n_f32(0); + float32x4_t sum2 = vdupq_n_f32(0); + float32x4_t sum3 = vdupq_n_f32(0); + float32x4_t sum4 = vdupq_n_f32(0); + float32x4_t sum5 = vdupq_n_f32(0); + float32x4_t sum6 = vdupq_n_f32(0); + float32x4_t sum7 = vdupq_n_f32(0); + + float32x4_t x0, x1, x2, x3, x4, x5, x6, x7; + float32x4_t y0, y1, y2, y3, y4, y5, y6, y7; + + for (int i = 0; i < n32; i += 32) { + x0 = vcvt_f32_f16(vld1_f16(x + i + 0 )); + x1 = vcvt_f32_f16(vld1_f16(x + i + 4 )); + x2 = vcvt_f32_f16(vld1_f16(x + i + 8 )); + x3 = vcvt_f32_f16(vld1_f16(x + i + 12)); + x4 = vcvt_f32_f16(vld1_f16(x + i + 16)); + x5 = vcvt_f32_f16(vld1_f16(x + i + 20)); + x6 = vcvt_f32_f16(vld1_f16(x + i + 24)); + x7 = vcvt_f32_f16(vld1_f16(x + i + 28)); + + y0 = vcvt_f32_f16(vld1_f16(y + i + 0 )); + y1 = vcvt_f32_f16(vld1_f16(y + i + 4 )); + y2 = vcvt_f32_f16(vld1_f16(y + i + 8 )); + y3 = vcvt_f32_f16(vld1_f16(y + i + 12)); + y4 = vcvt_f32_f16(vld1_f16(y + i + 16)); + y5 = vcvt_f32_f16(vld1_f16(y + i + 20)); + y6 = vcvt_f32_f16(vld1_f16(y + i + 24)); + y7 = vcvt_f32_f16(vld1_f16(y + i + 28)); + + sum0 = vfmaq_f32(sum0, x0, y0); + sum1 = vfmaq_f32(sum1, x1, y1); + sum2 = vfmaq_f32(sum2, x2, y2); + sum3 = vfmaq_f32(sum3, x3, y3); + sum4 = vfmaq_f32(sum4, x4, y4); + sum5 = vfmaq_f32(sum5, x5, y5); + sum6 = vfmaq_f32(sum6, x6, y6); + sum7 = vfmaq_f32(sum7, x7, y7); + } + + // reduce sum0..sum7 to sum0 + sum0 = vaddq_f32(sum0, sum1); + sum2 = vaddq_f32(sum2, sum3); + sum4 = vaddq_f32(sum4, sum5); + sum6 = vaddq_f32(sum6, sum7); + sum0 = vaddq_f32(sum0, sum2); + sum4 = vaddq_f32(sum4, sum6); + sum0 = vaddq_f32(sum0, sum4); + + // reduce sum0 to sumf + float32x2_t sumf32 = vadd_f32(vget_low_f32(sum0), vget_high_f32(sum0)); + sumf = vget_lane_f32(sumf32, 0) + vget_lane_f32(sumf32, 1); +#endif + + // leftovers + for (int i = n32; i < n; ++i) { + sumf += GGML_FP16_TO_FP32(x[i])*GGML_FP16_TO_FP32(y[i]); + } +#elif defined(__AVX2__) + // AVX 256-bit + const int n32 = (n & ~31); + + __m256 sum0 = _mm256_setzero_ps(); + __m256 sum1 = _mm256_setzero_ps(); + __m256 sum2 = _mm256_setzero_ps(); + __m256 sum3 = _mm256_setzero_ps(); + + __m256 x0, x1, x2, x3; + __m256 y0, y1, y2, y3; + + for (int i = 0; i < n32; i += 32) { + x0 = _mm256_cvtph_ps(_mm_loadu_si128((__m128i*)(x + i + 0 ))); + x1 = _mm256_cvtph_ps(_mm_loadu_si128((__m128i*)(x + i + 8 ))); + x2 = _mm256_cvtph_ps(_mm_loadu_si128((__m128i*)(x + i + 16))); + x3 = _mm256_cvtph_ps(_mm_loadu_si128((__m128i*)(x + i + 24))); + + y0 = _mm256_cvtph_ps(_mm_loadu_si128((__m128i*)(y + i + 0 ))); + y1 = _mm256_cvtph_ps(_mm_loadu_si128((__m128i*)(y + i + 8 ))); + y2 = _mm256_cvtph_ps(_mm_loadu_si128((__m128i*)(y + i + 16))); + y3 = _mm256_cvtph_ps(_mm_loadu_si128((__m128i*)(y + i + 24))); + + sum0 = _mm256_fmadd_ps(x0, y0, sum0); + sum1 = _mm256_fmadd_ps(x1, y1, sum1); + sum2 = _mm256_fmadd_ps(x2, y2, sum2); + sum3 = _mm256_fmadd_ps(x3, y3, sum3); + } + + const __m256 sum01 = _mm256_add_ps(sum0, sum1); + const __m256 sum23 = _mm256_add_ps(sum2, sum3); + const 
__m256 sum0123 = _mm256_add_ps(sum01, sum23); + + const __m128 r4 = _mm_add_ps(_mm256_castps256_ps128(sum0123), _mm256_extractf128_ps(sum0123, 1)); + const __m128 r2 = _mm_add_ps(r4, _mm_movehl_ps(r4, r4)); + const __m128 r1 = _mm_add_ss(r2, _mm_movehdup_ps(r2)); + + sumf = _mm_cvtss_f32(r1); + + // leftovers + for (int i = n32; i < n; ++i) { + //GGML_ASSERT(false); + sumf += GGML_FP16_TO_FP32(x[i])*GGML_FP16_TO_FP32(y[i]); + } +#elif defined(__AVX__) + // AVX 256-bit + const int n32 = (n & ~31); + + __m256 sum0 = _mm256_setzero_ps(); + __m256 sum1 = _mm256_setzero_ps(); + __m256 sum2 = _mm256_setzero_ps(); + __m256 sum3 = _mm256_setzero_ps(); + + __m256 x0, x1, x2, x3; + __m256 y0, y1, y2, y3; + + for (int i = 0; i < n32; i += 32) { + x0 = _mm256_cvtph_ps(_mm_loadu_si128((__m128i*)(x + i + 0 ))); + x1 = _mm256_cvtph_ps(_mm_loadu_si128((__m128i*)(x + i + 8 ))); + x2 = _mm256_cvtph_ps(_mm_loadu_si128((__m128i*)(x + i + 16))); + x3 = _mm256_cvtph_ps(_mm_loadu_si128((__m128i*)(x + i + 24))); + + y0 = _mm256_cvtph_ps(_mm_loadu_si128((__m128i*)(y + i + 0 ))); + y1 = _mm256_cvtph_ps(_mm_loadu_si128((__m128i*)(y + i + 8 ))); + y2 = _mm256_cvtph_ps(_mm_loadu_si128((__m128i*)(y + i + 16))); + y3 = _mm256_cvtph_ps(_mm_loadu_si128((__m128i*)(y + i + 24))); + + sum0 = _mm256_add_ps(_mm256_mul_ps(x0, y0), sum0); + sum1 = _mm256_add_ps(_mm256_mul_ps(x1, y1), sum1); + sum2 = _mm256_add_ps(_mm256_mul_ps(x2, y2), sum2); + sum3 = _mm256_add_ps(_mm256_mul_ps(x3, y3), sum3); + } + + const __m256 sum01 = _mm256_add_ps(sum0, sum1); + const __m256 sum23 = _mm256_add_ps(sum2, sum3); + const __m256 sum0123 = _mm256_add_ps(sum01, sum23); + + const __m128 r4 = _mm_add_ps(_mm256_castps256_ps128(sum0123), _mm256_extractf128_ps(sum0123, 1)); + const __m128 r2 = _mm_add_ps(r4, _mm_movehl_ps(r4, r4)); + const __m128 r1 = _mm_add_ss(r2, _mm_movehdup_ps(r2)); + + sumf = _mm_cvtss_f32(r1); + + // leftovers + for (int i = n32; i < n; ++i) { + //GGML_ASSERT(false); + sumf += GGML_FP16_TO_FP32(x[i])*GGML_FP16_TO_FP32(y[i]); + } +#elif defined(__wasm_simd128__) + // WASM 128-bit + const int n16 = (n & ~15); + + v128_t sum0 = wasm_f32x4_splat(0.0f); + v128_t sum1 = wasm_f32x4_splat(0.0f); + v128_t sum2 = wasm_f32x4_splat(0.0f); + v128_t sum3 = wasm_f32x4_splat(0.0f); + + v128_t x0, x1, x2, x3; + v128_t y0, y1, y2, y3; + + float tx[16]; + float ty[16]; + + for (int i = 0; i < n16; i += 16) { + for (int k = 0; k < 16; ++k) { + tx[k] = GGML_FP16_TO_FP32(x[i + k]); + ty[k] = GGML_FP16_TO_FP32(y[i + k]); + } + + x0 = wasm_v128_load(tx + 0); + x1 = wasm_v128_load(tx + 4); + x2 = wasm_v128_load(tx + 8); + x3 = wasm_v128_load(tx + 12); + + y0 = wasm_v128_load(ty + 0); + y1 = wasm_v128_load(ty + 4); + y2 = wasm_v128_load(ty + 8); + y3 = wasm_v128_load(ty + 12); + + sum0 = wasm_f32x4_add(sum0, wasm_f32x4_mul(x0, y0)); + sum1 = wasm_f32x4_add(sum1, wasm_f32x4_mul(x1, y1)); + sum2 = wasm_f32x4_add(sum2, wasm_f32x4_mul(x2, y2)); + sum3 = wasm_f32x4_add(sum3, wasm_f32x4_mul(x3, y3)); + } + + sum0 = wasm_f32x4_add(sum0, sum1); + sum2 = wasm_f32x4_add(sum2, sum3); + sum0 = wasm_f32x4_add(sum0, sum2); + + sumf = wasm_f32x4_extract_lane(sum0, 0) + wasm_f32x4_extract_lane(sum0, 1) + wasm_f32x4_extract_lane(sum0, 2) + wasm_f32x4_extract_lane(sum0, 3); + + // leftovers + for (int i = n16; i < n; ++i) { + //GGML_ASSERT(false); + sumf += GGML_FP16_TO_FP32(x[i])*GGML_FP16_TO_FP32(y[i]); + } +#else + for (int i = 0; i < n; ++i) { + sumf += GGML_FP16_TO_FP32(x[i])*GGML_FP16_TO_FP32(y[i]); + } +#endif + + *s = sumf; +} + +inline static void 
ggml_vec_mad_f32(const int n, float * restrict y, const float * restrict x, const float v) { +#ifdef __ARM_NEON + // NEON 128-bit + const int n16 = (n & ~15); + + const float32x4_t v4 = vdupq_n_f32(v); + + float32x4_t x0, x1, x2, x3; + float32x4_t y0, y1, y2, y3; + + for (int i = 0; i < n16; i += 16) { + x0 = vld1q_f32(x + i + 0); + x1 = vld1q_f32(x + i + 4); + x2 = vld1q_f32(x + i + 8); + x3 = vld1q_f32(x + i + 12); + + y0 = vld1q_f32(y + i + 0); + y1 = vld1q_f32(y + i + 4); + y2 = vld1q_f32(y + i + 8); + y3 = vld1q_f32(y + i + 12); + + y0 = vfmaq_f32(y0, x0, v4); + y1 = vfmaq_f32(y1, x1, v4); + y2 = vfmaq_f32(y2, x2, v4); + y3 = vfmaq_f32(y3, x3, v4); + + vst1q_f32(y + i + 0, y0); + vst1q_f32(y + i + 4, y1); + vst1q_f32(y + i + 8, y2); + vst1q_f32(y + i + 12, y3); + } + + // leftovers + for (int i = n16; i < n; ++i) { + y[i] += x[i]*v; + } +#elif defined(__AVX2__) + // AVX 256-bit + const int n32 = (n & ~31); + + const __m256 v4 = _mm256_set1_ps(v); + + __m256 x0, x1, x2, x3; + __m256 y0, y1, y2, y3; + + for (int i = 0; i < n32; i += 32) { + x0 = _mm256_loadu_ps(x + i + 0); + x1 = _mm256_loadu_ps(x + i + 8); + x2 = _mm256_loadu_ps(x + i + 16); + x3 = _mm256_loadu_ps(x + i + 24); + + y0 = _mm256_loadu_ps(y + i + 0); + y1 = _mm256_loadu_ps(y + i + 8); + y2 = _mm256_loadu_ps(y + i + 16); + y3 = _mm256_loadu_ps(y + i + 24); + + y0 = _mm256_fmadd_ps(x0, v4, y0); + y1 = _mm256_fmadd_ps(x1, v4, y1); + y2 = _mm256_fmadd_ps(x2, v4, y2); + y3 = _mm256_fmadd_ps(x3, v4, y3); + + _mm256_storeu_ps(y + i + 0, y0); + _mm256_storeu_ps(y + i + 8, y1); + _mm256_storeu_ps(y + i + 16, y2); + _mm256_storeu_ps(y + i + 24, y3); + } + + // leftovers + for (int i = n32; i < n; ++i) { + y[i] += x[i]*v; + } +#elif defined(__AVX__) + // AVX 256-bit + const int n32 = (n & ~31); + + const __m256 v4 = _mm256_set1_ps(v); + + __m256 x0, x1, x2, x3; + __m256 y0, y1, y2, y3; + + for (int i = 0; i < n32; i += 32) { + x0 = _mm256_loadu_ps(x + i + 0); + x1 = _mm256_loadu_ps(x + i + 8); + x2 = _mm256_loadu_ps(x + i + 16); + x3 = _mm256_loadu_ps(x + i + 24); + + y0 = _mm256_loadu_ps(y + i + 0); + y1 = _mm256_loadu_ps(y + i + 8); + y2 = _mm256_loadu_ps(y + i + 16); + y3 = _mm256_loadu_ps(y + i + 24); + + y0 = _mm256_add_ps(_mm256_mul_ps(x0, v4), y0); + y1 = _mm256_add_ps(_mm256_mul_ps(x1, v4), y1); + y2 = _mm256_add_ps(_mm256_mul_ps(x2, v4), y2); + y3 = _mm256_add_ps(_mm256_mul_ps(x3, v4), y3); + + _mm256_storeu_ps(y + i + 0, y0); + _mm256_storeu_ps(y + i + 8, y1); + _mm256_storeu_ps(y + i + 16, y2); + _mm256_storeu_ps(y + i + 24, y3); + } + + // leftovers + for (int i = n32; i < n; ++i) { + y[i] += x[i]*v; + } +#elif defined(__wasm_simd128__) + // WASM SIMD 128-bit + const int n16 = (n & ~15); + + const v128_t v4 = wasm_f32x4_splat(v); + + v128_t x0, x1, x2, x3; + v128_t y0, y1, y2, y3; + + for (int i = 0; i < n16; i += 16) { + x0 = wasm_v128_load(x + i + 0); + x1 = wasm_v128_load(x + i + 4); + x2 = wasm_v128_load(x + i + 8); + x3 = wasm_v128_load(x + i + 12); + + y0 = wasm_v128_load(y + i + 0); + y1 = wasm_v128_load(y + i + 4); + y2 = wasm_v128_load(y + i + 8); + y3 = wasm_v128_load(y + i + 12); + + y0 = wasm_f32x4_add(y0, wasm_f32x4_mul(x0, v4)); + y1 = wasm_f32x4_add(y1, wasm_f32x4_mul(x1, v4)); + y2 = wasm_f32x4_add(y2, wasm_f32x4_mul(x2, v4)); + y3 = wasm_f32x4_add(y3, wasm_f32x4_mul(x3, v4)); + + wasm_v128_store(y + i + 0, y0); + wasm_v128_store(y + i + 4, y1); + wasm_v128_store(y + i + 8, y2); + wasm_v128_store(y + i + 12, y3); + } + + // leftovers + for (int i = n16; i < n; ++i) { + y[i] += x[i]*v; + } +#else + // 
scalar + for (int i = 0; i < n; ++i) { + y[i] += x[i]*v; + } +#endif +} + +inline static void ggml_vec_mad_f16(const int n, ggml_fp16_t * restrict y, ggml_fp16_t * restrict x, const float v) { +#ifdef __ARM_NEON + // NEON 128-bit + const int n32 = (n & ~31); + +#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) + const float16x8_t v8 = vdupq_n_f16(v); + + float16x8_t x0, x1, x2, x3; + float16x8_t y0, y1, y2, y3; + + for (int i = 0; i < n32; i += 32) { + y0 = vld1q_f16(y + i + 0 ); + y1 = vld1q_f16(y + i + 8 ); + y2 = vld1q_f16(y + i + 16); + y3 = vld1q_f16(y + i + 24); + + x0 = vld1q_f16(x + i + 0 ); + x1 = vld1q_f16(x + i + 8 ); + x2 = vld1q_f16(x + i + 16); + x3 = vld1q_f16(x + i + 24); + + y0 = vfmaq_f16(y0, x0, v8); + y1 = vfmaq_f16(y1, x1, v8); + y2 = vfmaq_f16(y2, x2, v8); + y3 = vfmaq_f16(y3, x3, v8); + + vst1q_f16(y + i + 0 , y0); + vst1q_f16(y + i + 8 , y1); + vst1q_f16(y + i + 16, y2); + vst1q_f16(y + i + 24, y3); + } +#else + const float32x4_t v40 = vdupq_n_f32(v); + const float32x4_t v41 = vdupq_n_f32(v); + + float32x4_t x0, x1, x2, x3, x4, x5, x6, x7; + float32x4_t y0, y1, y2, y3, y4, y5, y6, y7; + + for (int i = 0; i < n32; i += 32) { + y0 = vcvt_f32_f16(vld1_f16(y + i + 0 )); + y1 = vcvt_f32_f16(vld1_f16(y + i + 4 )); + y2 = vcvt_f32_f16(vld1_f16(y + i + 8 )); + y3 = vcvt_f32_f16(vld1_f16(y + i + 12)); + y4 = vcvt_f32_f16(vld1_f16(y + i + 16)); + y5 = vcvt_f32_f16(vld1_f16(y + i + 20)); + y6 = vcvt_f32_f16(vld1_f16(y + i + 24)); + y7 = vcvt_f32_f16(vld1_f16(y + i + 28)); + + x0 = vcvt_f32_f16(vld1_f16(x + i + 0 )); + x1 = vcvt_f32_f16(vld1_f16(x + i + 4 )); + x2 = vcvt_f32_f16(vld1_f16(x + i + 8 )); + x3 = vcvt_f32_f16(vld1_f16(x + i + 12)); + x4 = vcvt_f32_f16(vld1_f16(x + i + 16)); + x5 = vcvt_f32_f16(vld1_f16(x + i + 20)); + x6 = vcvt_f32_f16(vld1_f16(x + i + 24)); + x7 = vcvt_f32_f16(vld1_f16(x + i + 28)); + + y0 = vfmaq_f32(y0, x0, v40); + y1 = vfmaq_f32(y1, x1, v40); + y2 = vfmaq_f32(y2, x2, v40); + y3 = vfmaq_f32(y3, x3, v40); + y4 = vfmaq_f32(y4, x4, v41); + y5 = vfmaq_f32(y5, x5, v41); + y6 = vfmaq_f32(y6, x6, v41); + y7 = vfmaq_f32(y7, x7, v41); + + vst1_f16(y + i + 0 , vcvt_f16_f32(y0)); + vst1_f16(y + i + 4 , vcvt_f16_f32(y1)); + vst1_f16(y + i + 8 , vcvt_f16_f32(y2)); + vst1_f16(y + i + 12, vcvt_f16_f32(y3)); + vst1_f16(y + i + 16, vcvt_f16_f32(y4)); + vst1_f16(y + i + 20, vcvt_f16_f32(y5)); + vst1_f16(y + i + 24, vcvt_f16_f32(y6)); + vst1_f16(y + i + 28, vcvt_f16_f32(y7)); + } +#endif + + // leftovers + for (int i = n32; i < n; ++i) { + GGML_ASSERT(false); + y[i] = GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(y[i]) + GGML_FP16_TO_FP32(x[i])*v); + } +#elif defined(__AVX2__) + // AVX 256-bit + const int n32 = (n & ~31); + + const __m256 v8 = _mm256_set1_ps(v); + + __m256 x0, x1, x2, x3; + __m256 y0, y1, y2, y3; + + for (int i = 0; i < n32; i += 32) { + y0 = _mm256_cvtph_ps(_mm_loadu_si128((__m128i*)(y + i + 0 ))); + y1 = _mm256_cvtph_ps(_mm_loadu_si128((__m128i*)(y + i + 8 ))); + y2 = _mm256_cvtph_ps(_mm_loadu_si128((__m128i*)(y + i + 16))); + y3 = _mm256_cvtph_ps(_mm_loadu_si128((__m128i*)(y + i + 24))); + + x0 = _mm256_cvtph_ps(_mm_loadu_si128((__m128i*)(x + i + 0 ))); + x1 = _mm256_cvtph_ps(_mm_loadu_si128((__m128i*)(x + i + 8 ))); + x2 = _mm256_cvtph_ps(_mm_loadu_si128((__m128i*)(x + i + 16))); + x3 = _mm256_cvtph_ps(_mm_loadu_si128((__m128i*)(x + i + 24))); + + y0 = _mm256_fmadd_ps(x0, v8, y0); + y1 = _mm256_fmadd_ps(x1, v8, y1); + y2 = _mm256_fmadd_ps(x2, v8, y2); + y3 = _mm256_fmadd_ps(x3, v8, y3); + + _mm_storeu_si128((__m128i*)(y + i + 0 ), _mm256_cvtps_ph(y0, 
0)); + _mm_storeu_si128((__m128i*)(y + i + 8 ), _mm256_cvtps_ph(y1, 0)); + _mm_storeu_si128((__m128i*)(y + i + 16), _mm256_cvtps_ph(y2, 0)); + _mm_storeu_si128((__m128i*)(y + i + 24), _mm256_cvtps_ph(y3, 0)); + } + + // leftovers + for (int i = n32; i < n; ++i) { + GGML_ASSERT(false); + y[i] = GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(y[i]) + GGML_FP16_TO_FP32(x[i])*v); + } +#elif defined(__AVX__) + // AVX 256-bit + const int n32 = (n & ~31); + + const __m256 v8 = _mm256_set1_ps(v); + + __m256 x0, x1, x2, x3; + __m256 y0, y1, y2, y3; + + for (int i = 0; i < n32; i += 32) { + y0 = _mm256_cvtph_ps(_mm_loadu_si128((__m128i*)(y + i + 0 ))); + y1 = _mm256_cvtph_ps(_mm_loadu_si128((__m128i*)(y + i + 8 ))); + y2 = _mm256_cvtph_ps(_mm_loadu_si128((__m128i*)(y + i + 16))); + y3 = _mm256_cvtph_ps(_mm_loadu_si128((__m128i*)(y + i + 24))); + + x0 = _mm256_cvtph_ps(_mm_loadu_si128((__m128i*)(x + i + 0 ))); + x1 = _mm256_cvtph_ps(_mm_loadu_si128((__m128i*)(x + i + 8 ))); + x2 = _mm256_cvtph_ps(_mm_loadu_si128((__m128i*)(x + i + 16))); + x3 = _mm256_cvtph_ps(_mm_loadu_si128((__m128i*)(x + i + 24))); + + y0 = _mm256_add_ps(_mm256_mul_ps(x0, v8), y0); + y1 = _mm256_add_ps(_mm256_mul_ps(x1, v8), y1); + y2 = _mm256_add_ps(_mm256_mul_ps(x2, v8), y2); + y3 = _mm256_add_ps(_mm256_mul_ps(x3, v8), y3); + + _mm_storeu_si128((__m128i*)(y + i + 0 ), _mm256_cvtps_ph(y0, 0)); + _mm_storeu_si128((__m128i*)(y + i + 8 ), _mm256_cvtps_ph(y1, 0)); + _mm_storeu_si128((__m128i*)(y + i + 16), _mm256_cvtps_ph(y2, 0)); + _mm_storeu_si128((__m128i*)(y + i + 24), _mm256_cvtps_ph(y3, 0)); + } + + // leftovers + for (int i = n32; i < n; ++i) { + GGML_ASSERT(false); + y[i] = GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(y[i]) + GGML_FP16_TO_FP32(x[i])*v); + } +#elif defined(__wasm_simd128__) + // WASM SIMD 128-bit + const int n16 = (n & ~15); + + const v128_t v4 = wasm_f32x4_splat(v); + + v128_t x0, x1, x2, x3; + v128_t y0, y1, y2, y3; + + float tx[16]; + float ty[16]; + + for (int i = 0; i < n16; i += 16) { + for (int k = 0; k < 16; ++k) { + tx[k] = GGML_FP16_TO_FP32(x[i + k]); + ty[k] = GGML_FP16_TO_FP32(y[i + k]); + } + + x0 = wasm_v128_load(tx + 0); + x1 = wasm_v128_load(tx + 4); + x2 = wasm_v128_load(tx + 8); + x3 = wasm_v128_load(tx + 12); + + y0 = wasm_v128_load(ty + 0); + y1 = wasm_v128_load(ty + 4); + y2 = wasm_v128_load(ty + 8); + y3 = wasm_v128_load(ty + 12); + + y0 = wasm_f32x4_add(y0, wasm_f32x4_mul(x0, v4)); + y1 = wasm_f32x4_add(y1, wasm_f32x4_mul(x1, v4)); + y2 = wasm_f32x4_add(y2, wasm_f32x4_mul(x2, v4)); + y3 = wasm_f32x4_add(y3, wasm_f32x4_mul(x3, v4)); + + wasm_v128_store(ty + 0, y0); + wasm_v128_store(ty + 4, y1); + wasm_v128_store(ty + 8, y2); + wasm_v128_store(ty + 12, y3); + + for (int k = 0; k < 16; ++k) { + y[i + k] = GGML_FP32_TO_FP16(ty[k]); + } + } + + // leftovers + for (int i = n16; i < n; ++i) { + GGML_ASSERT(false); + y[i] = GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(y[i]) + GGML_FP16_TO_FP32(x[i])*v); + } +#else + for (int i = 0; i < n; ++i) { + y[i] = GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(y[i]) + GGML_FP16_TO_FP32(x[i])*v); + } +#endif +} + +inline static void ggml_vec_scale_f32(const int n, float * y, const float v) { for (int i = 0; i < n; ++i) y[i] *= v; } +inline static void ggml_vec_norm_f32 (const int n, float * s, const float * x) { ggml_vec_dot_f32(n, s, x, x); *s = sqrt(*s); } +inline static void ggml_vec_sqr_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = x[i]*x[i]; } +inline static void ggml_vec_sqrt_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) 
y[i] = sqrt(x[i]); } +inline static void ggml_vec_abs_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = fabsf(x[i]); } +inline static void ggml_vec_sgn_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? 1.f : ((x[i] < 0.f) ? -1.f : 0.f); } +inline static void ggml_vec_step_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? 1.f : 0.f; } +inline static void ggml_vec_relu_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? x[i] : 0.f; } + +const ggml_float GELU_COEF_A = 0.044715; +const ggml_float SQRT_2_OVER_PI = 0.79788456080286535587989211986876; + +inline static float ggml_gelu_f32(float x) { + return 0.5*x*(1.0 + tanh(SQRT_2_OVER_PI*x*(1.0 + GELU_COEF_A*x*x))); +} + +inline static void ggml_vec_gelu_f16(const int n, ggml_fp16_t * y, const ggml_fp16_t * x) { + const uint16_t * i16 = (const uint16_t *) x; + for (int i = 0; i < n; ++i) { + y[i] = table_gelu_f16[i16[i]]; + } +} + +#ifdef GGML_GELU_FP16 +inline static void ggml_vec_gelu_f32(const int n, float * y, const float * x) { + uint16_t t; + for (int i = 0; i < n; ++i) { + ggml_fp16_t fp16 = GGML_FP32_TO_FP16(x[i]); + memcpy(&t, &fp16, sizeof(uint16_t)); + y[i] = GGML_FP16_TO_FP32(table_gelu_f16[t]); + } +} +#else +inline static void ggml_vec_gelu_f32(const int n, float * y, const float * x) { + for (int i = 0; i < n; ++i) { + y[i] = ggml_gelu_f32(x[i]); + } +} +#endif + +inline static void ggml_vec_sum_f32 (const int n, float * s, const float * x) { ggml_float sum = 0.0; for (int i = 0; i < n; ++i) sum += x[i]; *s += sum; } +inline static void ggml_vec_norm_inv_f32(const int n, float * s, const float * x) { ggml_vec_norm_f32(n, s, x); *s = 1./(*s); } + +// +// logging +// + +#if (GGML_DEBUG >= 1) +#define GGML_PRINT_DEBUG(...) printf(__VA_ARGS__) +#else +#define GGML_PRINT_DEBUG(...) +#endif + +#if (GGML_DEBUG >= 5) +#define GGML_PRINT_DEBUG_5(...) printf(__VA_ARGS__) +#else +#define GGML_PRINT_DEBUG_5(...) +#endif + +#if (GGML_DEBUG >= 10) +#define GGML_PRINT_DEBUG_10(...) printf(__VA_ARGS__) +#else +#define GGML_PRINT_DEBUG_10(...) +#endif + +#define GGML_PRINT(...) 
printf(__VA_ARGS__) + +// +// data types +// + +const size_t GGML_TYPE_SIZE[GGML_TYPE_COUNT] = { + sizeof(int8_t ), + sizeof(int16_t), + sizeof(int32_t), + sizeof(ggml_fp16_t), + sizeof(float ), +}; + +const char * GGML_OP_LABEL[GGML_OP_COUNT] = { + "NONE", + + "DUP", + "ADD", + "SUB", + "MUL", + "DIV", + "SQR", + "SQRT", + "SUM", + "MEAN", + "REPEAT", + "ABS", + "SGN", + "NEG", + "STEP", + "RELU", + "GELU", + "NORM", + + "MUL_MAT", + + "SCALE", + "CPY", + "RESHAPE", + "VIEW", + "PERMUTE", + "TRANSPOSE", + "GET_ROWS", + "DIAG_MASK_INF", + "SOFT_MAX", + "ROPE", + "CONV_1D_1S", + "CONV_1D_2S", + + "FLASH_ATTN", + "FLASH_FF", +}; + +const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = { + "none", + + "x", + "x+y", + "x-y", + "x*y", + "x/y", + "x^2", + "√x", + "Σx", + "Σx/n", + "repeat(x)", + "abs(x)", + "sgn(x)", + "-x", + "step(x)", + "relu(x)", + "gelu(x)", + "norm(x)", + + "X*Y", + + "x*v", + "x-\\>y", + "reshape(x)", + "view(x)", + "permute(x)", + "transpose(x)", + "get_rows(x)", + "diag_mask_inf(x)", + "soft_max(x)", + "rope(x)", + "conv_1d_1s(x)", + "conv_1d_2s(x)", + + "flash_attn(x)", + "flash_ff(x)", +}; + +// +// ggml object +// + +struct ggml_object { + size_t offset; + size_t size; + + struct ggml_object * next; + + char padding[8]; +}; + +const size_t GGML_OBJECT_SIZE = sizeof(struct ggml_object); + +static_assert(sizeof(struct ggml_object)%GGML_MEM_ALIGN == 0, "ggml_object size must be a multiple of GGML_MEM_ALIGN"); +static_assert(sizeof(struct ggml_tensor)%GGML_MEM_ALIGN == 0, "ggml_tensor size must be a multiple of GGML_MEM_ALIGN"); + +// +// ggml context +// + +struct ggml_context { + size_t mem_size; + void * mem_buffer; + bool mem_buffer_owned; + + int n_objects; + + struct ggml_object * objects_begin; + struct ggml_object * objects_end; +}; + +struct ggml_context_container { + bool used; + + struct ggml_context context; +}; + +// +// compute types +// + +enum ggml_task_type { + GGML_TASK_INIT = 0, + GGML_TASK_COMPUTE, + GGML_TASK_FINALIZE, +}; + +struct ggml_compute_params { + enum ggml_task_type type; + + int ith, nth; + + // work buffer for all threads + size_t wsize; + void * wdata; +}; + +// +// ggml state +// + +struct ggml_state { + struct ggml_context_container contexts[GGML_MAX_CONTEXTS]; +}; + +// global state +struct ggml_state g_state; +atomic_int g_state_barrier = 0; + +//////////////////////////////////////////////////////////////////////////////// + +void ggml_print_object(const struct ggml_object * obj) { + GGML_PRINT(" - ggml_object: offset = %zu, size = %zu, next = %p\n", + obj->offset, obj->size, (const void *) obj->next); +} + +void ggml_print_objects(const struct ggml_context * ctx) { + struct ggml_object * obj = ctx->objects_begin; + + GGML_PRINT("%s: objects in context %p:\n", __func__, (const void *) ctx); + + while (obj != NULL) { + ggml_print_object(obj); + obj = obj->next; + } + + GGML_PRINT("%s: --- end ---\n", __func__); +} + +int ggml_nelements(const struct ggml_tensor * tensor) { + static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function"); + + return tensor->ne[0]*tensor->ne[1]*tensor->ne[2]*tensor->ne[3]; +} + +int ggml_nrows(const struct ggml_tensor * tensor) { + static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function"); + + return tensor->ne[1]*tensor->ne[2]*tensor->ne[3]; +} + +size_t ggml_nbytes(const struct ggml_tensor * tensor) { + static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function"); + + return ggml_nelements(tensor)*GGML_TYPE_SIZE[tensor->type]; +} + +size_t 
ggml_type_size(enum ggml_type type) { + return GGML_TYPE_SIZE[type]; +} + +size_t ggml_element_size(const struct ggml_tensor * tensor) { + return GGML_TYPE_SIZE[tensor->type]; +} + +bool ggml_is_scalar(const struct ggml_tensor * tensor) { + static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function"); + + return tensor->ne[0] == 1 && tensor->ne[1] == 1 && tensor->ne[2] == 1 && tensor->ne[3] == 1; +} + +bool ggml_is_vector(const struct ggml_tensor * tensor) { + static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function"); + + return tensor->ne[1] == 1 && tensor->ne[2] == 1 && tensor->ne[3] == 1; +} + +bool ggml_is_matrix(const struct ggml_tensor * tensor) { + static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function"); + + return tensor->ne[2] == 1 && tensor->ne[3] == 1; +} + +bool ggml_can_mul_mat(const struct ggml_tensor * t0, const struct ggml_tensor * t1) { + static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function"); + + return + (t0->ne[0] == t1->ne[0]) && + (t0->ne[2] == t1->ne[2]) && + (t0->ne[3] == t1->ne[3]); +} + +bool ggml_is_contiguous(const struct ggml_tensor * tensor) { + static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function"); + + return + tensor->nb[0] == GGML_TYPE_SIZE[tensor->type] && + tensor->nb[1] == tensor->nb[0]*tensor->ne[0] && + tensor->nb[2] == tensor->nb[1]*tensor->ne[1] && + tensor->nb[3] == tensor->nb[2]*tensor->ne[2]; +} + +bool ggml_is_padded_1d(const struct ggml_tensor * tensor) { + static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function"); + + return + tensor->nb[0] == GGML_TYPE_SIZE[tensor->type] && + tensor->nb[2] == tensor->nb[1]*tensor->ne[1] && + tensor->nb[3] == tensor->nb[2]*tensor->ne[2];; +} + +bool ggml_are_same_shape(const struct ggml_tensor * t0, const struct ggml_tensor * t1) { + static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function"); + + return + (t0->ne[0] == t1->ne[0] ) && + (t0->ne[1] == t1->ne[1] ) && + (t0->ne[2] == t1->ne[2] ) && + (t0->ne[3] == t1->ne[3] ); +} + +// check if t1 can be represented as a repeatition of t0 +bool ggml_can_repeat(const struct ggml_tensor * t0, const struct ggml_tensor * t1) { + static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function"); + + return + (t1->ne[0]%t0->ne[0] == 0) && + (t1->ne[1]%t0->ne[1] == 0) && + (t1->ne[2]%t0->ne[2] == 0) && + (t1->ne[3]%t0->ne[3] == 0); +} + +int ggml_up32(int n) { + return (n + 31) & ~31; +} + +int ggml_up64(int n) { + return (n + 63) & ~63; +} + +// assert that pointer is aligned to GGML_MEM_ALIGN +#define ggml_assert_aligned(ptr) \ + assert(((uintptr_t) (ptr))%GGML_MEM_ALIGN == 0) + +//////////////////////////////////////////////////////////////////////////////// + +struct ggml_context * ggml_init(struct ggml_init_params params) { + // make this function thread safe + { + int processing = atomic_fetch_add(&g_state_barrier, 1); + while (processing > 0) { + // wait for other threads to finish + atomic_fetch_sub(&g_state_barrier, 1); + sched_yield(); + processing = atomic_fetch_add(&g_state_barrier, 1); + } + } + + static bool is_first_call = true; + if (is_first_call) { + const uint64_t t_start = ggml_time_us(); UNUSED(t_start); + + ggml_fp16_t ii; + for (int i = 0; i < (1 << 16); ++i) { + uint16_t ui = i; + memcpy(&ii, &ui, sizeof(ii)); + const float f = GGML_FP16_TO_FP32(ii); + table_gelu_f16[i] = GGML_FP32_TO_FP16(ggml_gelu_f32(f)); + table_exp_f16[i] = GGML_FP32_TO_FP16(exp(f)); 
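            // The two tables filled here cover every possible 16-bit pattern
            // (1 << 16 = 65536 entries), interpreted as fp16 input values, so each
            // table costs 65536 * sizeof(ggml_fp16_t) = 128 KiB and is built only on
            // the first call. ggml_vec_gelu_f16() above then reduces the tanh-based
            // GELU to a single lookup by reinterpreting the fp16 bits of x as the
            // table index, and (when GGML_GELU_FP16 is defined) the f32 GELU path
            // rounds through fp16 to reuse the same table, trading a little
            // precision for speed.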
+ } + + const uint64_t t_end = ggml_time_us(); UNUSED(t_end); + + GGML_PRINT_DEBUG("%s: GELU and EXP tables initialized in %f ms\n", __func__, (t_end - t_start)/1000.0f); + + is_first_call = false; + } + + // find non-used context in g_state + struct ggml_context * ctx = NULL; + + static bool first_time = true; + if (first_time) { + for (int i = 0; i < GGML_MAX_CONTEXTS; i++) { + g_state.contexts[i].used = false; + } + first_time = false; + } + + for (int i = 0; i < GGML_MAX_CONTEXTS; i++) { + if (!g_state.contexts[i].used) { + g_state.contexts[i].used = true; + ctx = &g_state.contexts[i].context; + + GGML_PRINT_DEBUG("%s: found unused context %d\n", __func__, i); + break; + } + } + + if (ctx == NULL) { + GGML_PRINT_DEBUG("%s: no unused context found\n", __func__); + + atomic_fetch_sub(&g_state_barrier, 1); + + return NULL; + } + + *ctx = (struct ggml_context) { + .mem_size = params.mem_size, + .mem_buffer = params.mem_buffer ? params.mem_buffer : malloc(params.mem_size), + .mem_buffer_owned = params.mem_buffer ? false : true, + .n_objects = 0, + .objects_begin = NULL, + .objects_end = NULL, + }; + + ggml_assert_aligned(ctx->mem_buffer); + + GGML_PRINT_DEBUG("%s: context initialized\n", __func__); + + atomic_fetch_sub(&g_state_barrier, 1); + + return ctx; +} + +void ggml_free(struct ggml_context * ctx) { + // make this function thread safe + { + int processing = atomic_fetch_add(&g_state_barrier, 1); + while (processing > 0) { + // wait for other threads to finish + atomic_fetch_sub(&g_state_barrier, 1); + sched_yield(); + processing = atomic_fetch_add(&g_state_barrier, 1); + } + } + + for (int i = 0; i < GGML_MAX_CONTEXTS; i++) { + if (&g_state.contexts[i].context == ctx) { + g_state.contexts[i].used = false; + + GGML_PRINT_DEBUG("%s: context %d with %d objects has been freed. memory used = %zu\n", + __func__, i, ctx->n_objects, ctx->objects_end->offset + ctx->objects_end->size); + + if (ctx->mem_buffer_owned) { + free(ctx->mem_buffer); + } + + atomic_fetch_sub(&g_state_barrier, 1); + + return; + } + } + + GGML_PRINT_DEBUG("%s: context not found\n", __func__); + + atomic_fetch_sub(&g_state_barrier, 1); +} + +size_t ggml_used_mem(const struct ggml_context * ctx) { + return ctx->objects_end->offset + ctx->objects_end->size; +} + +//////////////////////////////////////////////////////////////////////////////// + +struct ggml_tensor * ggml_new_tensor_impl( + struct ggml_context * ctx, + enum ggml_type type, + int n_dims, + const int* ne, + void* data) { + // always insert objects at the end of the context's memory pool + struct ggml_object * obj_cur = ctx->objects_end; + + const size_t cur_offset = obj_cur == NULL ? 0 : obj_cur->offset; + const size_t cur_size = obj_cur == NULL ? 
0 : obj_cur->size; + const size_t cur_end = cur_offset + cur_size; + + size_t size_needed = 0; + + if (data == NULL) { + size_needed += GGML_TYPE_SIZE[type]; + for (int i = 0; i < n_dims; i++) { + size_needed *= ne[i]; + } + // align to GGML_MEM_ALIGN + size_needed = ((size_needed + GGML_MEM_ALIGN - 1)/GGML_MEM_ALIGN)*GGML_MEM_ALIGN; + + } + size_needed += sizeof(struct ggml_tensor); + + if (cur_end + size_needed + GGML_OBJECT_SIZE > ctx->mem_size) { + GGML_PRINT("%s: not enough space in the context's memory pool\n", __func__); + assert(false); + return NULL; + } + + char * const mem_buffer = ctx->mem_buffer; + + struct ggml_object * const obj_new = (struct ggml_object *)(mem_buffer + cur_end); + + *obj_new = (struct ggml_object) { + .offset = cur_end + GGML_OBJECT_SIZE, + .size = size_needed, + .next = NULL, + }; + + if (obj_cur != NULL) { + obj_cur->next = obj_new; + } else { + // this is the first object in this context + ctx->objects_begin = obj_new; + } + + ctx->objects_end = obj_new; + + //GGML_PRINT_DEBUG("%s: inserted new object at %zu\n", __func__, cur_end); + + struct ggml_tensor * const result = (struct ggml_tensor *)(mem_buffer + obj_new->offset); + + ggml_assert_aligned(result); + + *result = (struct ggml_tensor) { + /*.type =*/ type, + /*.n_dims =*/ n_dims, + /*.ne =*/ { 1, 1, 1, 1 }, + /*.nb =*/ { 0, 0, 0, 0 }, + /*.op =*/ GGML_OP_NONE, + /*.is_param =*/ false, + /*.grad =*/ NULL, + /*.src0 =*/ NULL, + /*.src1 =*/ NULL, + /*.opt =*/ { NULL }, + /*.n_tasks =*/ 0, + /*.perf_runs =*/ 0, + /*.perf_cycles =*/ 0, + /*.perf_time_us =*/ 0, + /*.data =*/ data == NULL ? (void *)(result + 1) : data, + /*.pad =*/ { 0 }, + }; + + ggml_assert_aligned(result->data); + + for (int i = 0; i < n_dims; i++) { + result->ne[i] = ne[i]; + } + + result->nb[0] = GGML_TYPE_SIZE[type]; + for (int i = 1; i < GGML_MAX_DIMS; i++) { + result->nb[i] = result->nb[i - 1]*result->ne[i - 1]; + } + + ctx->n_objects++; + + return result; +} + +struct ggml_tensor * ggml_new_tensor( + struct ggml_context * ctx, + enum ggml_type type, + int n_dims, + const int* ne) { + return ggml_new_tensor_impl(ctx, type, n_dims, ne, NULL); +} + +struct ggml_tensor * ggml_new_tensor_1d( + struct ggml_context * ctx, + enum ggml_type type, + int ne0) { + return ggml_new_tensor(ctx, type, 1, &ne0); +} + +struct ggml_tensor * ggml_new_tensor_2d( + struct ggml_context * ctx, + enum ggml_type type, + int ne0, + int ne1) { + const int ne[2] = { ne0, ne1 }; + return ggml_new_tensor(ctx, type, 2, ne); +} + +struct ggml_tensor * ggml_new_tensor_3d( + struct ggml_context * ctx, + enum ggml_type type, + int ne0, + int ne1, + int ne2) { + const int ne[3] = { ne0, ne1, ne2 }; + return ggml_new_tensor(ctx, type, 3, ne); +} + +struct ggml_tensor * ggml_new_tensor_4d( + struct ggml_context * ctx, + enum ggml_type type, + int ne0, + int ne1, + int ne2, + int ne3) { + const int ne[4] = { ne0, ne1, ne2, ne3 }; + return ggml_new_tensor(ctx, type, 4, ne); +} + +struct ggml_tensor * ggml_new_i32(struct ggml_context * ctx, int32_t value) { + struct ggml_tensor * result = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 1); + + ggml_set_i32(result, value); + + return result; +} + +struct ggml_tensor * ggml_new_f32(struct ggml_context * ctx, float value) { + struct ggml_tensor * result = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 1); + + ggml_set_f32(result, value); + + return result; +} + +struct ggml_tensor * ggml_dup_tensor(struct ggml_context * ctx, const struct ggml_tensor * src) { + return ggml_new_tensor_impl(ctx, src->type, src->n_dims, src->ne, NULL); +} + 
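/*
   The constructors above all funnel into ggml_new_tensor_impl(), which bump-allocates
   from the context's single memory pool: each allocation appends a ggml_object header
   at the current end of the pool, followed by the ggml_tensor struct and (unless the
   caller supplied a data pointer) the tensor data itself, with the data size rounded
   up to GGML_MEM_ALIGN. Nothing is ever freed individually; memory is reclaimed only
   when the whole context is freed.

   A minimal usage sketch (illustrative only; the pool size is arbitrary and error
   handling is omitted):

       struct ggml_init_params params = {
           .mem_size   = 16*1024*1024,   // 16 MB pool shared by all tensors of this context
           .mem_buffer = NULL,           // let ggml_init() allocate the pool itself
       };

       struct ggml_context * ctx = ggml_init(params);

       struct ggml_tensor * a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 4, 3);
       ggml_set_f32(a, 1.0f);            // fill every element with 1.0

       printf("pool used: %zu bytes\n", ggml_used_mem(ctx));

       ggml_free(ctx);                   // releases every tensor of the context at once
*/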
+struct ggml_tensor * ggml_set_zero(struct ggml_tensor * tensor) { + memset(tensor->data, 0, ggml_nbytes(tensor)); + return tensor; +} + +struct ggml_tensor * ggml_set_i32 (struct ggml_tensor * tensor, int32_t value) { + const int n = ggml_nrows(tensor); + const int nc = tensor->ne[0]; + const size_t n1 = tensor->nb[1]; + + char * const data = tensor->data; + + switch (tensor->type) { + case GGML_TYPE_I8: + { + assert(tensor->nb[0] == sizeof(int8_t)); + for (int i = 0; i < n; i++) { + ggml_vec_set_i8(nc, (int8_t *)(data + i*n1), value); + } + } break; + case GGML_TYPE_I16: + { + assert(tensor->nb[0] == sizeof(int16_t)); + for (int i = 0; i < n; i++) { + ggml_vec_set_i16(nc, (int16_t *)(data + i*n1), value); + } + } break; + case GGML_TYPE_I32: + { + assert(tensor->nb[0] == sizeof(int32_t)); + for (int i = 0; i < n; i++) { + ggml_vec_set_i32(nc, (int32_t *)(data + i*n1), value); + } + } break; + case GGML_TYPE_F16: + { + assert(tensor->nb[0] == sizeof(ggml_fp16_t)); + for (int i = 0; i < n; i++) { + ggml_vec_set_f16(nc, (ggml_fp16_t *)(data + i*n1), value); + } + } break; + case GGML_TYPE_F32: + { + assert(tensor->nb[0] == sizeof(float)); + for (int i = 0; i < n; i++) { + ggml_vec_set_f32(nc, (float *)(data + i*n1), value); + } + } break; + case GGML_TYPE_COUNT: + { + assert(false); + } break; + } + + return tensor; +} + +struct ggml_tensor * ggml_set_f32(struct ggml_tensor * tensor, float value) { + const int n = ggml_nrows(tensor); + const int nc = tensor->ne[0]; + const size_t n1 = tensor->nb[1]; + + char * const data = tensor->data; + + switch (tensor->type) { + case GGML_TYPE_I8: + { + assert(tensor->nb[0] == sizeof(int8_t)); + for (int i = 0; i < n; i++) { + ggml_vec_set_i8(nc, (int8_t *)(data + i*n1), value); + } + } break; + case GGML_TYPE_I16: + { + assert(tensor->nb[0] == sizeof(int16_t)); + for (int i = 0; i < n; i++) { + ggml_vec_set_i16(nc, (int16_t *)(data + i*n1), value); + } + } break; + case GGML_TYPE_I32: + { + assert(tensor->nb[0] == sizeof(int32_t)); + for (int i = 0; i < n; i++) { + ggml_vec_set_i32(nc, (int32_t *)(data + i*n1), value); + } + } break; + case GGML_TYPE_F16: + { + assert(tensor->nb[0] == sizeof(ggml_fp16_t)); + for (int i = 0; i < n; i++) { + ggml_vec_set_f16(nc, (ggml_fp16_t *)(data + i*n1), value); + } + } break; + case GGML_TYPE_F32: + { + assert(tensor->nb[0] == sizeof(float)); + for (int i = 0; i < n; i++) { + ggml_vec_set_f32(nc, (float *)(data + i*n1), value); + } + } break; + case GGML_TYPE_COUNT: + { + assert(false); + } break; + } + + return tensor; +} + +int32_t ggml_get_i32_1d(const struct ggml_tensor * tensor, int i) { + switch (tensor->type) { + case GGML_TYPE_I8: + { + GGML_ASSERT(tensor->nb[0] == sizeof(int8_t)); + return ((int8_t *)(tensor->data))[i]; + } break; + case GGML_TYPE_I16: + { + GGML_ASSERT(tensor->nb[0] == sizeof(int16_t)); + return ((int16_t *)(tensor->data))[i]; + } break; + case GGML_TYPE_I32: + { + GGML_ASSERT(tensor->nb[0] == sizeof(int32_t)); + return ((int32_t *)(tensor->data))[i]; + } break; + case GGML_TYPE_F16: + { + GGML_ASSERT(tensor->nb[0] == sizeof(ggml_fp16_t)); + return GGML_FP16_TO_FP32(((ggml_fp16_t *)(tensor->data))[i]); + } break; + case GGML_TYPE_F32: + { + GGML_ASSERT(tensor->nb[0] == sizeof(float)); + return ((float *)(tensor->data))[i]; + } break; + case GGML_TYPE_COUNT: + { + GGML_ASSERT(false); + } break; + } + + return 0.0f; +} + +void ggml_set_i32_1d(const struct ggml_tensor * tensor, int i, int32_t value) { + switch (tensor->type) { + case GGML_TYPE_I8: + { + GGML_ASSERT(tensor->nb[0] == 
sizeof(int8_t)); + ((int8_t *)(tensor->data))[i] = value; + } break; + case GGML_TYPE_I16: + { + GGML_ASSERT(tensor->nb[0] == sizeof(int16_t)); + ((int16_t *)(tensor->data))[i] = value; + } break; + case GGML_TYPE_I32: + { + GGML_ASSERT(tensor->nb[0] == sizeof(int32_t)); + ((int32_t *)(tensor->data))[i] = value; + } break; + case GGML_TYPE_F16: + { + GGML_ASSERT(tensor->nb[0] == sizeof(ggml_fp16_t)); + ((ggml_fp16_t *)(tensor->data))[i] = GGML_FP32_TO_FP16(value); + } break; + case GGML_TYPE_F32: + { + GGML_ASSERT(tensor->nb[0] == sizeof(float)); + ((float *)(tensor->data))[i] = value; + } break; + case GGML_TYPE_COUNT: + { + GGML_ASSERT(false); + } break; + } +} + +float ggml_get_f32_1d(const struct ggml_tensor * tensor, int i) { + switch (tensor->type) { + case GGML_TYPE_I8: + { + GGML_ASSERT(tensor->nb[0] == sizeof(int8_t)); + return ((int8_t *)(tensor->data))[i]; + } break; + case GGML_TYPE_I16: + { + GGML_ASSERT(tensor->nb[0] == sizeof(int16_t)); + return ((int16_t *)(tensor->data))[i]; + } break; + case GGML_TYPE_I32: + { + GGML_ASSERT(tensor->nb[0] == sizeof(int32_t)); + return ((int32_t *)(tensor->data))[i]; + } break; + case GGML_TYPE_F16: + { + GGML_ASSERT(tensor->nb[0] == sizeof(ggml_fp16_t)); + return GGML_FP16_TO_FP32(((ggml_fp16_t *)(tensor->data))[i]); + } break; + case GGML_TYPE_F32: + { + GGML_ASSERT(tensor->nb[0] == sizeof(float)); + return ((float *)(tensor->data))[i]; + } break; + case GGML_TYPE_COUNT: + { + GGML_ASSERT(false); + } break; + } + + return 0.0f; +} + +void ggml_set_f32_1d(const struct ggml_tensor * tensor, int i, float value) { + switch (tensor->type) { + case GGML_TYPE_I8: + { + GGML_ASSERT(tensor->nb[0] == sizeof(int8_t)); + ((int8_t *)(tensor->data))[i] = value; + } break; + case GGML_TYPE_I16: + { + GGML_ASSERT(tensor->nb[0] == sizeof(int16_t)); + ((int16_t *)(tensor->data))[i] = value; + } break; + case GGML_TYPE_I32: + { + GGML_ASSERT(tensor->nb[0] == sizeof(int32_t)); + ((int32_t *)(tensor->data))[i] = value; + } break; + case GGML_TYPE_F16: + { + GGML_ASSERT(tensor->nb[0] == sizeof(ggml_fp16_t)); + ((ggml_fp16_t *)(tensor->data))[i] = GGML_FP32_TO_FP16(value); + } break; + case GGML_TYPE_F32: + { + GGML_ASSERT(tensor->nb[0] == sizeof(float)); + ((float *)(tensor->data))[i] = value; + } break; + case GGML_TYPE_COUNT: + { + GGML_ASSERT(false); + } break; + } +} + +void * ggml_get_data(const struct ggml_tensor * tensor) { + return tensor->data; +} + +float * ggml_get_data_f32(const struct ggml_tensor * tensor) { + assert(tensor->type == GGML_TYPE_F32); + return (float *)(tensor->data); +} + +struct ggml_tensor * ggml_view_tensor( + struct ggml_context * ctx, + const struct ggml_tensor * src) { + return ggml_new_tensor_impl(ctx, src->type, src->n_dims, src->ne, src->data); +} + +//////////////////////////////////////////////////////////////////////////////// + +// ggml_dup + +struct ggml_tensor * ggml_dup_impl( + struct ggml_context * ctx, + struct ggml_tensor * a, + bool inplace) { + bool is_node = false; + + if (!inplace && (a->grad)) { + is_node = true; + } + + struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); + + result->op = GGML_OP_DUP; + result->grad = is_node ? 
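        // Common pattern for all the op builders that follow: if any input carries a
        // gradient (and the op is not in-place), a gradient tensor with the same shape
        // as the result is pre-allocated here via ggml_dup_tensor(); otherwise grad
        // stays NULL. The values are filled in later when the graph is evaluated; at
        // this point only the graph structure (op, src0, src1) is recorded.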
ggml_dup_tensor(ctx, result) : NULL; + result->src0 = a; + result->src1 = NULL; + + return result; +} + +struct ggml_tensor * ggml_dup( + struct ggml_context * ctx, + struct ggml_tensor * a) { + return ggml_dup_impl(ctx, a, false); +} + +struct ggml_tensor * ggml_dup_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a) { + return ggml_dup_impl(ctx, a, true); +} + +// ggml_add + +struct ggml_tensor * ggml_add_impl( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + bool inplace) { + assert(ggml_are_same_shape(a, b)); + + bool is_node = false; + + if (!inplace && (a->grad || b->grad)) { + is_node = true; + } + + struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); + + result->op = GGML_OP_ADD; + result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result->src0 = a; + result->src1 = b; + + return result; +} + +struct ggml_tensor * ggml_add( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b) { + return ggml_add_impl(ctx, a, b, false); +} + +struct ggml_tensor * ggml_add_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b) { + return ggml_add_impl(ctx, a, b, true); +} + +// ggml_sub + +struct ggml_tensor * ggml_sub_impl( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + bool inplace) { + assert(ggml_are_same_shape(a, b)); + + bool is_node = false; + + if (!inplace && (a->grad || b->grad)) { + is_node = true; + } + + struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); + + result->op = GGML_OP_SUB; + result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result->src0 = a; + result->src1 = b; + + return result; +} + +struct ggml_tensor * ggml_sub( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b) { + return ggml_sub_impl(ctx, a, b, false); +} + +struct ggml_tensor * ggml_sub_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b) { + return ggml_sub_impl(ctx, a, b, true); +} + +// ggml_mul + +struct ggml_tensor * ggml_mul_impl( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + bool inplace) { + assert(ggml_are_same_shape(a, b)); + + bool is_node = false; + + if (!inplace && (a->grad || b->grad)) { + is_node = true; + } + + if (inplace) { + assert(is_node == false); + } + + struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); + + result->op = GGML_OP_MUL; + result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result->src0 = a; + result->src1 = b; + + return result; +} + +struct ggml_tensor * ggml_mul( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b) { + return ggml_mul_impl(ctx, a, b, false); +} + +struct ggml_tensor * ggml_mul_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b) { + return ggml_mul_impl(ctx, a, b, true); +} + +// ggml_div + +struct ggml_tensor * ggml_div_impl( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + bool inplace) { + assert(ggml_are_same_shape(a, b)); + + bool is_node = false; + + if (!inplace && (a->grad || b->grad)) { + is_node = true; + } + + if (inplace) { + assert(is_node == false); + } + + struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); + + result->op = GGML_OP_DIV; + result->grad = is_node ? 
ggml_dup_tensor(ctx, result) : NULL; + result->src0 = a; + result->src1 = b; + + return result; +} + +struct ggml_tensor * ggml_div( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b) { + return ggml_div_impl(ctx, a, b, false); +} + +struct ggml_tensor * ggml_div_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b) { + return ggml_div_impl(ctx, a, b, true); +} + +// ggml_sqr + +struct ggml_tensor * ggml_sqr_impl( + struct ggml_context * ctx, + struct ggml_tensor * a, + bool inplace) { + bool is_node = false; + + if (!inplace && (a->grad)) { + is_node = true; + } + + struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); + + result->op = GGML_OP_SQR; + result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result->src0 = a; + result->src1 = NULL; + + return result; +} + +struct ggml_tensor * ggml_sqr( + struct ggml_context * ctx, + struct ggml_tensor * a) { + return ggml_sqr_impl(ctx, a, false); +} + +struct ggml_tensor * ggml_sqr_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a) { + return ggml_sqr_impl(ctx, a, true); +} + +// ggml_sqrt + +struct ggml_tensor * ggml_sqrt_impl( + struct ggml_context * ctx, + struct ggml_tensor * a, + bool inplace) { + bool is_node = false; + + if (!inplace && (a->grad)) { + is_node = true; + } + + struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); + + result->op = GGML_OP_SQRT; + result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result->src0 = a; + result->src1 = NULL; + + return result; +} + +struct ggml_tensor * ggml_sqrt( + struct ggml_context * ctx, + struct ggml_tensor * a) { + return ggml_sqrt_impl(ctx, a, false); +} + +struct ggml_tensor * ggml_sqrt_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a) { + return ggml_sqrt_impl(ctx, a, true); +} + +// ggml_sum + +struct ggml_tensor * ggml_sum( + struct ggml_context * ctx, + struct ggml_tensor * a) { + bool is_node = false; + + if (a->grad) { + is_node = true; + } + + struct ggml_tensor * result = ggml_new_tensor_1d(ctx, a->type, 1); + + result->op = GGML_OP_SUM; + result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result->src0 = a; + result->src1 = NULL; + + return result; +} + +// ggml_mean + +struct ggml_tensor * ggml_mean( + struct ggml_context * ctx, + struct ggml_tensor * a) { + bool is_node = false; + + if (a->grad) { + assert(false); // TODO: implement + is_node = true; + } + + int ne[GGML_MAX_DIMS] = { 1, a->ne[1], a->ne[2], a->ne[3] }; + struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, a->n_dims, ne); + + result->op = GGML_OP_MEAN; + result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result->src0 = a; + result->src1 = NULL; + + return result; +} + +// ggml_repeat + +struct ggml_tensor * ggml_repeat( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b) { + assert(ggml_can_repeat(a, b)); + + bool is_node = false; + + if (a->grad) { + is_node = true; + } + + if (ggml_are_same_shape(a, b) && !is_node) { + return a; + } + + struct ggml_tensor * result = ggml_new_tensor(ctx, a->type, b->n_dims, b->ne); + + result->op = GGML_OP_REPEAT; + result->grad = is_node ? 
ggml_dup_tensor(ctx, result) : NULL; + result->src0 = a; + result->src1 = NULL; + + return result; +} + +// ggml_abs + +struct ggml_tensor * ggml_abs_impl( + struct ggml_context * ctx, + struct ggml_tensor * a, + bool inplace) { + bool is_node = false; + + if (!inplace && (a->grad)) { + is_node = true; + } + + struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); + + result->op = GGML_OP_ABS; + result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result->src0 = a; + result->src1 = NULL; + + return result; +} + +struct ggml_tensor * ggml_abs( + struct ggml_context * ctx, + struct ggml_tensor * a) { + return ggml_abs_impl(ctx, a, false); +} + +struct ggml_tensor * ggml_abs_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a) { + return ggml_abs_impl(ctx, a, true); +} + + +// ggml_sgn + +struct ggml_tensor * ggml_sgn_impl( + struct ggml_context * ctx, + struct ggml_tensor * a, + bool inplace) { + bool is_node = false; + + if (!inplace && (a->grad)) { + is_node = true; + } + + struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); + + result->op = GGML_OP_SGN; + result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result->src0 = a; + result->src1 = NULL; + + return result; +} + +struct ggml_tensor * ggml_sgn( + struct ggml_context * ctx, + struct ggml_tensor * a) { + return ggml_sgn_impl(ctx, a, false); +} + +struct ggml_tensor * ggml_sgn_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a) { + return ggml_sgn_impl(ctx, a, true); +} + +// ggml_neg + +struct ggml_tensor * ggml_neg_impl( + struct ggml_context * ctx, + struct ggml_tensor * a, + bool inplace) { + bool is_node = false; + + if (!inplace && (a->grad)) { + is_node = true; + } + + struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); + + result->op = GGML_OP_NEG; + result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result->src0 = a; + result->src1 = NULL; + + return result; +} + +struct ggml_tensor * ggml_neg( + struct ggml_context * ctx, + struct ggml_tensor * a) { + return ggml_neg_impl(ctx, a, false); +} + +struct ggml_tensor * ggml_neg_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a) { + return ggml_neg_impl(ctx, a, true); +} + +// ggml_step + +struct ggml_tensor * ggml_step_impl( + struct ggml_context * ctx, + struct ggml_tensor * a, + bool inplace) { + bool is_node = false; + + if (!inplace && (a->grad)) { + is_node = true; + } + + struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); + + result->op = GGML_OP_STEP; + result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result->src0 = a; + result->src1 = NULL; + + return result; +} + +struct ggml_tensor * ggml_step( + struct ggml_context * ctx, + struct ggml_tensor * a) { + return ggml_step_impl(ctx, a, false); +} + +struct ggml_tensor * ggml_step_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a) { + return ggml_step_impl(ctx, a, true); +} + +// ggml_relu + +struct ggml_tensor * ggml_relu_impl( + struct ggml_context * ctx, + struct ggml_tensor * a, + bool inplace) { + bool is_node = false; + + if (!inplace && (a->grad)) { + is_node = true; + } + + struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); + + result->op = GGML_OP_RELU; + result->grad = is_node ? 
ggml_dup_tensor(ctx, result) : NULL; + result->src0 = a; + result->src1 = NULL; + + return result; +} + +struct ggml_tensor * ggml_relu( + struct ggml_context * ctx, + struct ggml_tensor * a) { + return ggml_relu_impl(ctx, a, false); +} + +struct ggml_tensor * ggml_relu_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a) { + return ggml_relu_impl(ctx, a, true); +} + +// ggml_gelu + +struct ggml_tensor * ggml_gelu_impl( + struct ggml_context * ctx, + struct ggml_tensor * a, + bool inplace) { + bool is_node = false; + + if (!inplace && (a->grad)) { + is_node = true; + } + + struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); + + result->op = GGML_OP_GELU; + result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result->src0 = a; + result->src1 = NULL; + + return result; +} + +struct ggml_tensor * ggml_gelu( + struct ggml_context * ctx, + struct ggml_tensor * a) { + return ggml_gelu_impl(ctx, a, false); +} + +struct ggml_tensor * ggml_gelu_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a) { + return ggml_gelu_impl(ctx, a, true); +} + +// ggml_norm + +struct ggml_tensor * ggml_norm_impl( + struct ggml_context * ctx, + struct ggml_tensor * a, + bool inplace) { + bool is_node = false; + + if (!inplace && (a->grad)) { + assert(false); // TODO: implement backward + is_node = true; + } + + struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); + + result->op = GGML_OP_NORM; + result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result->src0 = a; + result->src1 = NULL; // TODO: maybe store epsilon here? + + return result; +} + +struct ggml_tensor * ggml_norm( + struct ggml_context * ctx, + struct ggml_tensor * a) { + return ggml_norm_impl(ctx, a, false); +} + +struct ggml_tensor * ggml_norm_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a) { + return ggml_norm_impl(ctx, a, true); +} + +// ggml_mul_mat + +struct ggml_tensor * ggml_mul_mat( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b) { + assert(ggml_can_mul_mat(a, b)); + + bool is_node = false; + + if (a->grad || b->grad) { + is_node = true; + } + + const int ne[4] = { a->ne[1], b->ne[1], a->ne[2], b->ne[3] }; + struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, MIN(a->n_dims, b->n_dims), ne); + + result->op = GGML_OP_MUL_MAT; + result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result->src0 = a; + result->src1 = b; + + return result; +} + +// ggml_scale + +struct ggml_tensor * ggml_scale_impl( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + bool inplace) { + assert(ggml_is_scalar(b)); + assert(ggml_is_padded_1d(a)); + + bool is_node = false; + + if (!inplace && (a->grad || b->grad)) { + assert(false); // TODO: implement backward + is_node = true; + } + + // TODO: when implement backward, fix this: + //struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); + struct ggml_tensor * result = ggml_view_tensor(ctx, a); + + result->op = GGML_OP_SCALE; + result->grad = is_node ? 
ggml_dup_tensor(ctx, result) : NULL; + result->src0 = a; + result->src1 = b; + + return result; +} + +struct ggml_tensor * ggml_scale( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b) { + return ggml_scale_impl(ctx, a, b, false); +} + +struct ggml_tensor * ggml_scale_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b) { + return ggml_scale_impl(ctx, a, b, true); +} + +// ggml_cpy + +struct ggml_tensor * ggml_cpy_impl( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + bool inplace) { + assert(ggml_nelements(a) == ggml_nelements(b)); + + bool is_node = false; + + if (!inplace && (a->grad || b->grad)) { + assert(false); // TODO: implement backward + is_node = true; + } + + // make a view of the destination + struct ggml_tensor * result = ggml_view_tensor(ctx, b); + + result->op = GGML_OP_CPY; + result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result->src0 = a; + result->src1 = b; + + return result; +} + +struct ggml_tensor * ggml_cpy( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b) { + return ggml_cpy_impl(ctx, a, b, false); +} + +struct ggml_tensor * ggml_cpy_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b) { + return ggml_cpy_impl(ctx, a, b, true); +} + +// ggml_reshape + +struct ggml_tensor * ggml_reshape( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b) { + assert(ggml_is_contiguous(a)); + assert(ggml_is_contiguous(b)); + assert(ggml_nelements(a) == ggml_nelements(b)); + + bool is_node = false; + + if (a->grad || b->grad) { + assert(false); // TODO: implement backward + is_node = true; + } + + struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, b->n_dims, b->ne, a->data); + + result->op = GGML_OP_RESHAPE; + result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result->src0 = a; + result->src1 = NULL; + + return result; +} + +struct ggml_tensor * ggml_reshape_2d( + struct ggml_context * ctx, + struct ggml_tensor * a, + int ne0, + int ne1) { + assert(ggml_is_contiguous(a)); + assert(ggml_nelements(a) == ne0*ne1); + + bool is_node = false; + + if (a->grad) { + assert(false); // TODO: implement backward + is_node = true; + } + + const int ne[2] = { ne0, ne1 }; + struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 2, ne, a->data); + + result->op = GGML_OP_RESHAPE; + result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result->src0 = a; + result->src1 = NULL; + + return result; +} + +struct ggml_tensor * ggml_reshape_3d( + struct ggml_context * ctx, + struct ggml_tensor * a, + int ne0, + int ne1, + int ne2) { + assert(ggml_is_contiguous(a)); + assert(ggml_nelements(a) == ne0*ne1*ne2); + + bool is_node = false; + + if (a->grad) { + assert(false); // TODO: implement backward + is_node = true; + } + + const int ne[3] = { ne0, ne1, ne2 }; + struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 3, ne, a->data); + + result->op = GGML_OP_RESHAPE; + result->grad = is_node ? 
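        // Note that all the reshape variants pass a->data straight into
        // ggml_new_tensor_impl(), so the reshaped tensor is a view over the same
        // buffer rather than a copy; that is why they assert ggml_is_contiguous(a)
        // and matching element counts before changing the shape.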
ggml_dup_tensor(ctx, result) : NULL; + result->src0 = a; + result->src1 = NULL; + + return result; +} + +// ggml_view_1d + +struct ggml_tensor * ggml_view_1d( + struct ggml_context * ctx, + struct ggml_tensor * a, + int ne0, + size_t offset) { + if (a->grad) { + assert(false); // gradient propagation is not supported + } + + struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 1, &ne0, (char *) a->data + offset); + + result->op = GGML_OP_VIEW; + result->grad = NULL; + result->src0 = a; + result->src1 = NULL; // TODO: maybe store the offset here? + + return result; +} + +// ggml_view_2d + +struct ggml_tensor * ggml_view_2d( + struct ggml_context * ctx, + struct ggml_tensor * a, + int ne0, + int ne1, + size_t nb1, + size_t offset) { + if (a->grad) { + assert(false); // gradient propagation is not supported + } + + const int ne[GGML_MAX_DIMS] = { ne0, ne1, 1, 1 }; + + struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 2, ne, (char *) a->data + offset); + + result->nb[1] = nb1; + result->nb[2] = result->nb[1]*ne1; + result->nb[3] = result->nb[2]; + + result->op = GGML_OP_VIEW; + result->grad = NULL; + result->src0 = a; + result->src1 = NULL; // TODO: maybe store the offset here? + + return result; +} + +// ggml_permute + +struct ggml_tensor * ggml_permute( + struct ggml_context * ctx, + struct ggml_tensor * a, + int axis0, + int axis1, + int axis2, + int axis3) { + assert(axis0 >= 0 && axis0 < GGML_MAX_DIMS); + assert(axis1 >= 0 && axis1 < GGML_MAX_DIMS); + assert(axis2 >= 0 && axis2 < GGML_MAX_DIMS); + assert(axis3 >= 0 && axis3 < GGML_MAX_DIMS); + + assert(axis0 != axis1); + assert(axis0 != axis2); + assert(axis0 != axis3); + assert(axis1 != axis2); + assert(axis1 != axis3); + assert(axis2 != axis3); + + bool is_node = false; + + if (a->grad) { + assert(false); // TODO: implement backward + is_node = true; + } + + struct ggml_tensor * result = ggml_view_tensor(ctx, a); + + int ne[GGML_MAX_DIMS]; + int nb[GGML_MAX_DIMS]; + + ne[axis0] = a->ne[0]; + ne[axis1] = a->ne[1]; + ne[axis2] = a->ne[2]; + ne[axis3] = a->ne[3]; + + nb[axis0] = a->nb[0]; + nb[axis1] = a->nb[1]; + nb[axis2] = a->nb[2]; + nb[axis3] = a->nb[3]; + + result->ne[0] = ne[0]; + result->ne[1] = ne[1]; + result->ne[2] = ne[2]; + result->ne[3] = ne[3]; + + result->nb[0] = nb[0]; + result->nb[1] = nb[1]; + result->nb[2] = nb[2]; + result->nb[3] = nb[3]; + + result->op = GGML_OP_PERMUTE; + result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result->src0 = a; + result->src1 = NULL; // TODO: maybe store the permutation here? + + return result; +} + +// ggml_transpose + +struct ggml_tensor * ggml_transpose( + struct ggml_context * ctx, + struct ggml_tensor * a) { + bool is_node = false; + + if (a->grad) { + assert(false); // TODO: implement backward + is_node = true; + } + + struct ggml_tensor * result = ggml_view_tensor(ctx, a); + + result->ne[0] = a->ne[1]; + result->ne[1] = a->ne[0]; + + result->nb[0] = a->nb[1]; + result->nb[1] = a->nb[0]; + + result->op = GGML_OP_TRANSPOSE; + result->grad = is_node ? 
ggml_dup_tensor(ctx, result) : NULL; + result->src0 = a; + result->src1 = NULL; + + return result; +} + +// ggml_get_rows + +struct ggml_tensor * ggml_get_rows( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b) { + assert(ggml_is_matrix(a) && ggml_is_vector(b) && b->type == GGML_TYPE_I32); + + bool is_node = false; + + if (a->grad || b->grad) { + assert(false); // TODO: implement backward + is_node = true; + } + + // TODO: implement non F32 return + //struct ggml_tensor * result = ggml_new_tensor_2d(ctx, a->type, a->ne[0], b->ne[0]); + struct ggml_tensor * result = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, a->ne[0], b->ne[0]); + + result->op = GGML_OP_GET_ROWS; + result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result->src0 = a; + result->src1 = b; + + return result; +} + +// ggml_diag_mask_inf + +struct ggml_tensor * ggml_diag_mask_inf( + struct ggml_context * ctx, + struct ggml_tensor * a, + int n_past) { + bool is_node = false; + + if (a->grad) { + assert(false); // TODO: implement backward + is_node = true; + } + + // TODO: when implement backward, fix this: + //struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); + struct ggml_tensor * result = ggml_view_tensor(ctx, a); + + struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 1); + ((int32_t *) b->data)[0] = n_past; + + result->op = GGML_OP_DIAG_MASK_INF; + result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result->src0 = a; + result->src1 = b; + + return result; +} + +// ggml_soft_max + +struct ggml_tensor * ggml_soft_max( + struct ggml_context * ctx, + struct ggml_tensor * a) { + bool is_node = false; + + if (a->grad) { + assert(false); // TODO: implement backward + is_node = true; + } + + // TODO: when implement backward, fix this: + //struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); + struct ggml_tensor * result = ggml_view_tensor(ctx, a); + + result->op = GGML_OP_SOFT_MAX; + result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result->src0 = a; + result->src1 = NULL; + + return result; +} + +// ggml_rope + +struct ggml_tensor * ggml_rope( + struct ggml_context * ctx, + struct ggml_tensor * a, + int n_past, + int n_dims, + int mode) { + assert(n_past >= 0); + bool is_node = false; + + if (a->grad) { + assert(false); // TODO: implement backward + is_node = true; + } + + // TODO: when implement backward, fix this: + //struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); + struct ggml_tensor * result = ggml_view_tensor(ctx, a); + + struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 3); + ((int32_t *) b->data)[0] = n_past; + ((int32_t *) b->data)[1] = n_dims; + ((int32_t *) b->data)[2] = mode; + + result->op = GGML_OP_ROPE; + result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result->src0 = a; + result->src1 = b; + + return result; +} + +// ggml_conv_1d_1s + +struct ggml_tensor * ggml_conv_1d_1s( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b) { + assert(ggml_is_matrix(b)); + assert(a->ne[1] == b->ne[1]); + assert(a->ne[3] == 1); + bool is_node = false; + + if (a->grad || b->grad) { + assert(false); // TODO: implement backward + is_node = true; + } + + const int ne[4] = { b->ne[0], a->ne[2], 1, 1, }; + struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 2, ne); + + result->op = GGML_OP_CONV_1D_1S; + result->grad = is_node ? 
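        // The result shape { b->ne[0], a->ne[2] } reflects a stride-1 1D convolution:
        // the output keeps the input length b->ne[0] in dim 0 and has one row per
        // output channel (a->ne[2]) in dim 1. ggml_conv_1d_2s below is the stride-2
        // variant, which is why its first dimension is b->ne[0]/2.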
ggml_dup_tensor(ctx, result) : NULL; + result->src0 = a; + result->src1 = b; + + return result; +} + +// ggml_conv_1d_2s + +struct ggml_tensor * ggml_conv_1d_2s( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b) { + assert(ggml_is_matrix(b)); + assert(a->ne[1] == b->ne[1]); + assert(a->ne[3] == 1); + bool is_node = false; + + if (a->grad || b->grad) { + assert(false); // TODO: implement backward + is_node = true; + } + + const int ne[4] = { b->ne[0]/2, a->ne[2], 1, 1, }; + struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 2, ne); + + result->op = GGML_OP_CONV_1D_2S; + result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result->src0 = a; + result->src1 = b; + + return result; +} + +// ggml_flash_attn + +struct ggml_tensor * ggml_flash_attn( + struct ggml_context * ctx, + struct ggml_tensor * q, + struct ggml_tensor * k, + struct ggml_tensor * v, + bool masked) { + assert(ggml_can_mul_mat(k, q)); + // TODO: check if vT can be multiplied by (k*qT) + + bool is_node = false; + + if (q->grad || k->grad || v->grad) { + GGML_ASSERT(false); // TODO: implement backward + is_node = true; + } + + //struct ggml_tensor * result = ggml_dup_tensor(ctx, q); + struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, q->ne); + + result->op = GGML_OP_FLASH_ATTN; + result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result->src0 = q; + result->src1 = k; + result->opt[0] = v; + result->opt[1] = ggml_new_i32(ctx, masked ? 1 : 0); + + return result; +} + +// ggml_flash_ff + +struct ggml_tensor * ggml_flash_ff( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b0, + struct ggml_tensor * b1, + struct ggml_tensor * c0, + struct ggml_tensor * c1) { + assert(ggml_can_mul_mat(b0, a)); + // TODO: more checks + + bool is_node = false; + + if (a->grad || b0->grad || b1->grad || c0->grad || c1->grad) { + GGML_ASSERT(false); // TODO: implement backward + is_node = true; + } + + //struct ggml_tensor * result = ggml_dup_tensor(ctx, a); + struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, a->ne); + + result->op = GGML_OP_FLASH_FF; + result->grad = is_node ? 
ggml_dup_tensor(ctx, result) : NULL; + result->src0 = a; + result->src1 = b0; + result->opt[0] = b1; + result->opt[1] = c0; + result->opt[2] = c1; + + return result; +} + +//////////////////////////////////////////////////////////////////////////////// + +void ggml_set_param( + struct ggml_context * ctx, + struct ggml_tensor * tensor) { + tensor->is_param = true; + + assert(tensor->grad == NULL); + tensor->grad = ggml_dup_tensor(ctx, tensor); +} + +// ggml_compute_forward_dup + +void ggml_compute_forward_dup_f16( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + struct ggml_tensor * dst) { + assert(params->ith == 0); + assert(ggml_is_contiguous(dst)); + assert(ggml_nelements(dst) == ggml_nelements(src0)); + + if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + return; + } + + //const int ne00 = src0->ne[0]; + //const int ne01 = src0->ne[1]; + //const int ne02 = src0->ne[2]; + //const int ne03 = src0->ne[3]; + + //const size_t nb00 = src0->nb[0]; + //const size_t nb01 = src0->nb[1]; + //const size_t nb02 = src0->nb[2]; + //const size_t nb03 = src0->nb[3]; + + if (ggml_is_contiguous(src0) && src0->type == dst->type) { + memcpy(dst->data, src0->data, ggml_nelements(dst) * GGML_TYPE_SIZE[src0->type]); + return; + } + + GGML_ASSERT(false); // TODO: implement +} + +void ggml_compute_forward_dup_f32( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + struct ggml_tensor * dst) { + GGML_ASSERT(params->ith == 0); + GGML_ASSERT(ggml_is_contiguous(dst)); + GGML_ASSERT(ggml_nelements(dst) == ggml_nelements(src0)); + + if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + return; + } + + const int ne00 = src0->ne[0]; + const int ne01 = src0->ne[1]; + const int ne02 = src0->ne[2]; + const int ne03 = src0->ne[3]; + + const size_t nb00 = src0->nb[0]; + const size_t nb01 = src0->nb[1]; + const size_t nb02 = src0->nb[2]; + const size_t nb03 = src0->nb[3]; + + if (ggml_is_contiguous(src0) && src0->type == dst->type) { + memcpy(dst->data, src0->data, ggml_nelements(dst) * GGML_TYPE_SIZE[src0->type]); + return; + } + + if (src0->nb[0] == sizeof(float)) { + if (dst->type == GGML_TYPE_F32) { + int id = 0; + const size_t rs = ne00*nb00; + + for (int i03 = 0; i03 < ne03; i03++) { + for (int i02 = 0; i02 < ne02; i02++) { + for (int i01 = 0; i01 < ne01; i01++) { + const char * src0_ptr = (char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03; + char * dst_ptr = (char *) dst->data + id*rs; + + memcpy(dst_ptr, src0_ptr, rs); + + id++; + } + } + } + } else if (dst->type == GGML_TYPE_F16) { + int id = 0; + ggml_fp16_t * dst_ptr = (ggml_fp16_t *) dst->data; + + for (int i03 = 0; i03 < ne03; i03++) { + for (int i02 = 0; i02 < ne02; i02++) { + for (int i01 = 0; i01 < ne01; i01++) { + for (int i00 = 0; i00 < ne00; i00++) { + const float * src0_ptr = (float *) ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03); + + dst_ptr[id] = GGML_FP32_TO_FP16(*src0_ptr); + id++; + } + } + } + } + } else { + GGML_ASSERT(false); // TODO: implement + } + } else { + //printf("%s: this is not optimal - fix me\n", __func__); + + if (dst->type == GGML_TYPE_F32) { + int id = 0; + float * dst_ptr = (float *) dst->data; + + for (int i03 = 0; i03 < ne03; i03++) { + for (int i02 = 0; i02 < ne02; i02++) { + for (int i01 = 0; i01 < ne01; i01++) { + for (int i00 = 0; i00 < ne00; i00++) { + const float * src0_ptr = (float *) ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03); + + dst_ptr[id] = *src0_ptr; + id++; + } 
+ } + } + } + } else if (dst->type == GGML_TYPE_F16) { + int id = 0; + ggml_fp16_t * dst_ptr = (ggml_fp16_t *) dst->data; + + for (int i03 = 0; i03 < ne03; i03++) { + for (int i02 = 0; i02 < ne02; i02++) { + for (int i01 = 0; i01 < ne01; i01++) { + for (int i00 = 0; i00 < ne00; i00++) { + const float * src0_ptr = (float *) ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03); + + dst_ptr[id] = GGML_FP32_TO_FP16(*src0_ptr); + id++; + } + } + } + } + } else { + GGML_ASSERT(false); // TODO: implement + } + } +} + +void ggml_compute_forward_dup( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + struct ggml_tensor * dst) { + switch (src0->type) { + case GGML_TYPE_F16: + { + ggml_compute_forward_dup_f16(params, src0, dst); + } break; + case GGML_TYPE_F32: + { + ggml_compute_forward_dup_f32(params, src0, dst); + } break; + case GGML_TYPE_I8: + case GGML_TYPE_I16: + case GGML_TYPE_I32: + case GGML_TYPE_COUNT: + { + GGML_ASSERT(false); + } break; + } +} + +// ggml_compute_forward_add + +void ggml_compute_forward_add_f32( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + const struct ggml_tensor * src1, + struct ggml_tensor * dst) { + GGML_ASSERT(ggml_are_same_shape(src0, src1) && ggml_are_same_shape(src0, dst)); + + if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + return; + } + + const int ith = params->ith; + const int nth = params->nth; + + const int n = ggml_nrows(src0); + const int nc = src0->ne[0]; + + const size_t nb00 = src0->nb[0]; + const size_t nb01 = src0->nb[1]; + + const size_t nb10 = src1->nb[0]; + const size_t nb11 = src1->nb[1]; + + const size_t nb0 = dst->nb[0]; + const size_t nb1 = dst->nb[1]; + + GGML_ASSERT( nb0 == sizeof(float)); + GGML_ASSERT(nb00 == sizeof(float)); + + if (nb10 == sizeof(float)) { + const int j0 = (n/nth)*ith; + const int j1 = ith == nth - 1 ? 
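            // Rows are split evenly across the nth worker threads: thread ith handles
            // rows [ (n/nth)*ith, (n/nth)*(ith+1) ), and the last thread additionally
            // absorbs the remainder when n is not divisible by nth. For example, with
            // n = 10 rows and nth = 4 threads the ranges are [0,2), [2,4), [4,6) and
            // [6,10).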
n : (n/nth)*(ith + 1); + + for (int j = j0; j < j1; j++) { + ggml_vec_add_f32(nc, + (float *) ((char *) dst->data + j*nb1), + (float *) ((char *) src0->data + j*nb01), + (float *) ((char *) src1->data + j*nb11)); + } + } else { + // src1 is not contiguous + for (int j = ith; j < n; j += nth) { + float * dst_ptr = (float *) ((char *) dst->data + j*nb1); + float * src0_ptr = (float *) ((char *) src0->data + j*nb01); + for (int i = 0; i < nc; i++) { + float * src1_ptr = (float *) ((char *) src1->data + j*nb11 + i*nb10); + + dst_ptr[i] = src0_ptr[i] + *src1_ptr; + } + } + } +} + +void ggml_compute_forward_add( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + const struct ggml_tensor * src1, + struct ggml_tensor * dst) { + switch (src0->type) { + case GGML_TYPE_F32: + { + ggml_compute_forward_add_f32(params, src0, src1, dst); + } break; + case GGML_TYPE_I8: + case GGML_TYPE_I16: + case GGML_TYPE_I32: + case GGML_TYPE_F16: + case GGML_TYPE_COUNT: + { + assert(false); + } break; + } +} + +// ggml_compute_forward_sub + +void ggml_compute_forward_sub_f32( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + const struct ggml_tensor * src1, + struct ggml_tensor * dst) { + assert(params->ith == 0); + assert(ggml_are_same_shape(src0, src1) && ggml_are_same_shape(src0, dst)); + + if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + return; + } + + const int n = ggml_nrows(src0); + const int nc = src0->ne[0]; + + assert( dst->nb[0] == sizeof(float)); + assert(src0->nb[0] == sizeof(float)); + assert(src1->nb[0] == sizeof(float)); + + for (int i = 0; i < n; i++) { + ggml_vec_sub_f32(nc, + (float *) ((char *) dst->data + i*( dst->nb[1])), + (float *) ((char *) src0->data + i*(src0->nb[1])), + (float *) ((char *) src1->data + i*(src1->nb[1]))); + } +} + +void ggml_compute_forward_sub( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + const struct ggml_tensor * src1, + struct ggml_tensor * dst) { + switch (src0->type) { + case GGML_TYPE_F32: + { + ggml_compute_forward_sub_f32(params, src0, src1, dst); + } break; + case GGML_TYPE_I8: + case GGML_TYPE_I16: + case GGML_TYPE_I32: + case GGML_TYPE_F16: + case GGML_TYPE_COUNT: + { + assert(false); + } break; + } +} + +// ggml_compute_forward_mul + +void ggml_compute_forward_mul_f32( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + const struct ggml_tensor * src1, + struct ggml_tensor * dst) { + assert(params->ith == 0); + assert(ggml_are_same_shape(src0, src1) && ggml_are_same_shape(src0, dst)); + + if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + return; + } + + const int n = ggml_nrows(src0); + const int nc = src0->ne[0]; + + assert( dst->nb[0] == sizeof(float)); + assert(src0->nb[0] == sizeof(float)); + assert(src1->nb[0] == sizeof(float)); + + for (int i = 0; i < n; i++) { + ggml_vec_mul_f32(nc, + (float *) ((char *) dst->data + i*( dst->nb[1])), + (float *) ((char *) src0->data + i*(src0->nb[1])), + (float *) ((char *) src1->data + i*(src1->nb[1]))); + } +} + +void ggml_compute_forward_mul( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + const struct ggml_tensor * src1, + struct ggml_tensor * dst) { + switch (src0->type) { + case GGML_TYPE_F32: + { + ggml_compute_forward_mul_f32(params, src0, src1, dst); + } break; + case GGML_TYPE_I8: + case GGML_TYPE_I16: + case GGML_TYPE_I32: + case GGML_TYPE_F16: + case GGML_TYPE_COUNT: + { + assert(false); + } 
break; + } +} + +// ggml_compute_forward_div + +void ggml_compute_forward_div_f32( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + const struct ggml_tensor * src1, + struct ggml_tensor * dst) { + assert(params->ith == 0); + assert(ggml_are_same_shape(src0, src1) && ggml_are_same_shape(src0, dst)); + + if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + return; + } + + const int n = ggml_nrows(src0); + const int nc = src0->ne[0]; + + assert( dst->nb[0] == sizeof(float)); + assert(src0->nb[0] == sizeof(float)); + assert(src1->nb[0] == sizeof(float)); + + for (int i = 0; i < n; i++) { + ggml_vec_div_f32(nc, + (float *) ((char *) dst->data + i*( dst->nb[1])), + (float *) ((char *) src0->data + i*(src0->nb[1])), + (float *) ((char *) src1->data + i*(src1->nb[1]))); + } +} + +void ggml_compute_forward_div( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + const struct ggml_tensor * src1, + struct ggml_tensor * dst) { + switch (src0->type) { + case GGML_TYPE_F32: + { + ggml_compute_forward_div_f32(params, src0, src1, dst); + } break; + case GGML_TYPE_I8: + case GGML_TYPE_I16: + case GGML_TYPE_I32: + case GGML_TYPE_F16: + case GGML_TYPE_COUNT: + { + assert(false); + } break; + } +} + +// ggml_compute_forward_sqr + +void ggml_compute_forward_sqr_f32( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + struct ggml_tensor * dst) { + assert(params->ith == 0); + assert(ggml_are_same_shape(src0, dst)); + + if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + return; + } + + const int n = ggml_nrows(src0); + const int nc = src0->ne[0]; + + assert( dst->nb[0] == sizeof(float)); + assert(src0->nb[0] == sizeof(float)); + + for (int i = 0; i < n; i++) { + ggml_vec_sqr_f32(nc, + (float *) ((char *) dst->data + i*( dst->nb[1])), + (float *) ((char *) src0->data + i*(src0->nb[1]))); + } +} + +void ggml_compute_forward_sqr( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + struct ggml_tensor * dst) { + switch (src0->type) { + case GGML_TYPE_F32: + { + ggml_compute_forward_sqr_f32(params, src0, dst); + } break; + case GGML_TYPE_I8: + case GGML_TYPE_I16: + case GGML_TYPE_I32: + case GGML_TYPE_F16: + case GGML_TYPE_COUNT: + { + assert(false); + } break; + } +} + +// ggml_compute_forward_sqrt + +void ggml_compute_forward_sqrt_f32( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + struct ggml_tensor * dst) { + assert(params->ith == 0); + assert(ggml_are_same_shape(src0, dst)); + + if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + return; + } + + const int n = ggml_nrows(src0); + const int nc = src0->ne[0]; + + assert( dst->nb[0] == sizeof(float)); + assert(src0->nb[0] == sizeof(float)); + + for (int i = 0; i < n; i++) { + ggml_vec_sqrt_f32(nc, + (float *) ((char *) dst->data + i*( dst->nb[1])), + (float *) ((char *) src0->data + i*(src0->nb[1]))); + } +} + +void ggml_compute_forward_sqrt( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + struct ggml_tensor * dst) { + switch (src0->type) { + case GGML_TYPE_F32: + { + ggml_compute_forward_sqrt_f32(params, src0, dst); + } break; + case GGML_TYPE_I8: + case GGML_TYPE_I16: + case GGML_TYPE_I32: + case GGML_TYPE_F16: + case GGML_TYPE_COUNT: + { + assert(false); + } break; + } +} + +// ggml_compute_forward_sum + +void ggml_compute_forward_sum_f32( + const struct ggml_compute_params * params, + const struct 
ggml_tensor * src0, + struct ggml_tensor * dst) { + assert(params->ith == 0); + assert(ggml_is_scalar(dst)); + + if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + return; + } + + assert(ggml_is_scalar(dst)); + assert(src0->nb[0] == sizeof(float)); + + *(float *) (dst->data) = 0.0f; + + const int ne00 = src0->ne[0]; + const int ne01 = src0->ne[1]; + const int ne02 = src0->ne[2]; + const int ne03 = src0->ne[3]; + + const size_t nb01 = src0->nb[1]; + const size_t nb02 = src0->nb[2]; + const size_t nb03 = src0->nb[3]; + + for (int i03 = 0; i03 < ne03; i03++) { + for (int i02 = 0; i02 < ne02; i02++) { + for (int i01 = 0; i01 < ne01; i01++) { + ggml_vec_sum_f32(ne00, + (float *) (dst->data), + (float *) ((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03)); + } + } + } +} + +void ggml_compute_forward_sum( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + struct ggml_tensor * dst) { + switch (src0->type) { + case GGML_TYPE_F32: + { + ggml_compute_forward_sum_f32(params, src0, dst); + } break; + case GGML_TYPE_I8: + case GGML_TYPE_I16: + case GGML_TYPE_I32: + case GGML_TYPE_F16: + case GGML_TYPE_COUNT: + { + assert(false); + } break; + } +} + +// ggml_compute_forward_mean + +void ggml_compute_forward_mean_f32( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + struct ggml_tensor * dst) { + assert(params->ith == 0); + + if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + return; + } + + assert(src0->nb[0] == sizeof(float)); + + const int ne00 = src0->ne[0]; + const int ne01 = src0->ne[1]; + const int ne02 = src0->ne[2]; + const int ne03 = src0->ne[3]; + + const size_t nb01 = src0->nb[1]; + const size_t nb02 = src0->nb[2]; + const size_t nb03 = src0->nb[3]; + + const int ne0 = dst->ne[0]; + const int ne1 = dst->ne[1]; + const int ne2 = dst->ne[2]; + const int ne3 = dst->ne[3]; + + assert(ne0 == 1); + assert(ne1 == ne01); + assert(ne2 == ne02); + assert(ne3 == ne03); + + UNUSED(ne0); + UNUSED(ne1); + UNUSED(ne2); + UNUSED(ne3); + + const size_t nb1 = dst->nb[1]; + const size_t nb2 = dst->nb[2]; + const size_t nb3 = dst->nb[3]; + + for (int i03 = 0; i03 < ne03; i03++) { + for (int i02 = 0; i02 < ne02; i02++) { + for (int i01 = 0; i01 < ne01; i01++) { + *(float *) ((char *) dst->data + i01*nb1 + i02*nb2 + i03*nb3) = 0.0f; + + ggml_vec_sum_f32(ne00, + (float *) ((char *) dst->data + i01*nb1 + i02*nb2 + i03*nb3), + (float *) ((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03)); + + *(float *) ((char *) dst->data + i01*nb1 + i02*nb2 + i03*nb3) /= (float) ne00; + } + } + } +} + +void ggml_compute_forward_mean( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + struct ggml_tensor * dst) { + switch (src0->type) { + case GGML_TYPE_F32: + { + ggml_compute_forward_mean_f32(params, src0, dst); + } break; + case GGML_TYPE_I8: + case GGML_TYPE_I16: + case GGML_TYPE_I32: + case GGML_TYPE_F16: + case GGML_TYPE_COUNT: + { + assert(false); + } break; + } +} + +// ggml_compute_forward_repeat + +void ggml_compute_forward_repeat_f32( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + struct ggml_tensor * dst) { + assert(params->ith == 0); + assert(ggml_can_repeat(src0, dst)); + + if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + return; + } + + // TODO: implement support for rank > 2 tensors + assert(src0->ne[2] == 1); + assert(src0->ne[3] == 1); + assert( dst->ne[2] == 1); + assert( dst->ne[3] == 1); + + const 
int nc = dst->ne[0]; + const int nr = dst->ne[1]; + const int nc0 = src0->ne[0]; + const int nr0 = src0->ne[1]; + const int ncr = nc/nc0; // guaranteed to be an integer due to the check in ggml_can_repeat + const int nrr = nr/nr0; // guaranteed to be an integer due to the check in ggml_can_repeat + + // TODO: support for transposed / permuted tensors + assert( dst->nb[0] == sizeof(float)); + assert(src0->nb[0] == sizeof(float)); + + // TODO: maybe this is not optimal? + for (int i = 0; i < nrr; i++) { + for (int j = 0; j < ncr; j++) { + for (int k = 0; k < nr0; k++) { + ggml_vec_cpy_f32(nc0, + (float *) ((char *) dst->data + (i*nr0 + k)*( dst->nb[1]) + j*nc0*( dst->nb[0])), + (float *) ((char *) src0->data + ( k)*(src0->nb[1]))); + } + } + } +} + +void ggml_compute_forward_repeat( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + struct ggml_tensor * dst) { + switch (src0->type) { + case GGML_TYPE_F32: + { + ggml_compute_forward_repeat_f32(params, src0, dst); + } break; + case GGML_TYPE_I8: + case GGML_TYPE_I16: + case GGML_TYPE_I32: + case GGML_TYPE_F16: + case GGML_TYPE_COUNT: + { + assert(false); + } break; + } +} + +// ggml_compute_forward_abs + +void ggml_compute_forward_abs_f32( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + struct ggml_tensor * dst) { + assert(params->ith == 0); + assert(ggml_are_same_shape(src0, dst)); + + if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + return; + } + + const int n = ggml_nrows(src0); + const int nc = src0->ne[0]; + + assert(dst->nb[0] == sizeof(float)); + assert(src0->nb[0] == sizeof(float)); + + for (int i = 0; i < n; i++) { + ggml_vec_abs_f32(nc, + (float *) ((char *) dst->data + i*( dst->nb[1])), + (float *) ((char *) src0->data + i*(src0->nb[1]))); + } +} + +void ggml_compute_forward_abs( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + struct ggml_tensor * dst) { + switch (src0->type) { + case GGML_TYPE_F32: + { + ggml_compute_forward_abs_f32(params, src0, dst); + } break; + case GGML_TYPE_I8: + case GGML_TYPE_I16: + case GGML_TYPE_I32: + case GGML_TYPE_F16: + case GGML_TYPE_COUNT: + { + assert(false); + } break; + } +} + +// ggml_compute_forward_sgn + +void ggml_compute_forward_sgn_f32( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + struct ggml_tensor * dst) { + assert(params->ith == 0); + assert(ggml_are_same_shape(src0, dst)); + + if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + return; + } + + const int n = ggml_nrows(src0); + const int nc = src0->ne[0]; + + assert(dst->nb[0] == sizeof(float)); + assert(src0->nb[0] == sizeof(float)); + + for (int i = 0; i < n; i++) { + ggml_vec_sgn_f32(nc, + (float *) ((char *) dst->data + i*( dst->nb[1])), + (float *) ((char *) src0->data + i*(src0->nb[1]))); + } +} + +void ggml_compute_forward_sgn( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + struct ggml_tensor * dst) { + switch (src0->type) { + case GGML_TYPE_F32: + { + ggml_compute_forward_sgn_f32(params, src0, dst); + } break; + case GGML_TYPE_I8: + case GGML_TYPE_I16: + case GGML_TYPE_I32: + case GGML_TYPE_F16: + case GGML_TYPE_COUNT: + { + assert(false); + } break; + } +} + +// ggml_compute_forward_neg + +void ggml_compute_forward_neg_f32( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + struct ggml_tensor * dst) { + assert(params->ith == 0); + assert(ggml_are_same_shape(src0, dst)); + + if 
(params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + return; + } + + const int n = ggml_nrows(src0); + const int nc = src0->ne[0]; + + assert(dst->nb[0] == sizeof(float)); + assert(src0->nb[0] == sizeof(float)); + + for (int i = 0; i < n; i++) { + ggml_vec_neg_f32(nc, + (float *) ((char *) dst->data + i*( dst->nb[1])), + (float *) ((char *) src0->data + i*(src0->nb[1]))); + } +} + +void ggml_compute_forward_neg( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + struct ggml_tensor * dst) { + switch (src0->type) { + case GGML_TYPE_F32: + { + ggml_compute_forward_neg_f32(params, src0, dst); + } break; + case GGML_TYPE_I8: + case GGML_TYPE_I16: + case GGML_TYPE_I32: + case GGML_TYPE_F16: + case GGML_TYPE_COUNT: + { + assert(false); + } break; + } +} + +// ggml_compute_forward_step + +void ggml_compute_forward_step_f32( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + struct ggml_tensor * dst) { + assert(params->ith == 0); + assert(ggml_are_same_shape(src0, dst)); + + if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + return; + } + + const int n = ggml_nrows(src0); + const int nc = src0->ne[0]; + + assert(dst->nb[0] == sizeof(float)); + assert(src0->nb[0] == sizeof(float)); + + for (int i = 0; i < n; i++) { + ggml_vec_step_f32(nc, + (float *) ((char *) dst->data + i*( dst->nb[1])), + (float *) ((char *) src0->data + i*(src0->nb[1]))); + } +} + +void ggml_compute_forward_step( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + struct ggml_tensor * dst) { + switch (src0->type) { + case GGML_TYPE_F32: + { + ggml_compute_forward_step_f32(params, src0, dst); + } break; + case GGML_TYPE_I8: + case GGML_TYPE_I16: + case GGML_TYPE_I32: + case GGML_TYPE_F16: + case GGML_TYPE_COUNT: + { + assert(false); + } break; + } +} + +// ggml_compute_forward_relu + +void ggml_compute_forward_relu_f32( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + struct ggml_tensor * dst) { + assert(params->ith == 0); + assert(ggml_are_same_shape(src0, dst)); + + if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + return; + } + + const int n = ggml_nrows(src0); + const int nc = src0->ne[0]; + + assert(dst->nb[0] == sizeof(float)); + assert(src0->nb[0] == sizeof(float)); + + for (int i = 0; i < n; i++) { + ggml_vec_relu_f32(nc, + (float *) ((char *) dst->data + i*( dst->nb[1])), + (float *) ((char *) src0->data + i*(src0->nb[1]))); + } +} + +void ggml_compute_forward_relu( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + struct ggml_tensor * dst) { + switch (src0->type) { + case GGML_TYPE_F32: + { + ggml_compute_forward_relu_f32(params, src0, dst); + } break; + case GGML_TYPE_I8: + case GGML_TYPE_I16: + case GGML_TYPE_I32: + case GGML_TYPE_F16: + case GGML_TYPE_COUNT: + { + assert(false); + } break; + } +} + +// ggml_compute_forward_gelu + +void ggml_compute_forward_gelu_f32( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + struct ggml_tensor * dst) { + GGML_ASSERT(ggml_is_contiguous(src0)); + GGML_ASSERT(ggml_is_contiguous(dst)); + GGML_ASSERT(ggml_are_same_shape(src0, dst)); + + if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + return; + } + + const int ith = params->ith; + const int nth = params->nth; + + const int nc = src0->ne[0]; + const int nr = ggml_nrows(src0); + + // rows per thread + const int dr = (nr + nth - 1)/nth; + + // 
row range for this thread + const int ir0 = dr*ith; + const int ir1 = MIN(ir0 + dr, nr); + + for (int i1 = ir0; i1 < ir1; i1++) { + ggml_vec_gelu_f32(nc, + (float *) ((char *) dst->data + i1*( dst->nb[1])), + (float *) ((char *) src0->data + i1*(src0->nb[1]))); + +#ifndef NDEBUG + for (int k = 0; k < nc; k++) { + const float x = ((float *) ((char *) dst->data + i1*( dst->nb[1])))[k]; + UNUSED(x); + assert(!isnan(x)); + assert(!isinf(x)); + } +#endif + } +} + +void ggml_compute_forward_gelu( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + struct ggml_tensor * dst) { + switch (src0->type) { + case GGML_TYPE_F32: + { + ggml_compute_forward_gelu_f32(params, src0, dst); + } break; + case GGML_TYPE_I8: + case GGML_TYPE_I16: + case GGML_TYPE_I32: + case GGML_TYPE_F16: + case GGML_TYPE_COUNT: + { + assert(false); + } break; + } +} + +// ggml_compute_forward_norm + +void ggml_compute_forward_norm_f32( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + struct ggml_tensor * dst) { + GGML_ASSERT(ggml_are_same_shape(src0, dst)); + + if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + return; + } + + GGML_ASSERT(src0->nb[0] == sizeof(float)); + + const int ith = params->ith; + const int nth = params->nth; + + const int ne00 = src0->ne[0]; + const int ne01 = src0->ne[1]; + const int ne02 = src0->ne[2]; + const int ne03 = src0->ne[3]; + + const size_t nb01 = src0->nb[1]; + const size_t nb02 = src0->nb[2]; + const size_t nb03 = src0->nb[3]; + + const size_t nb1 = dst->nb[1]; + const size_t nb2 = dst->nb[2]; + const size_t nb3 = dst->nb[3]; + + const ggml_float eps = 1e-5f; // TODO: make this a parameter + + // TODO: optimize + for (int i03 = 0; i03 < ne03; i03++) { + for (int i02 = 0; i02 < ne02; i02++) { + for (int i01 = ith; i01 < ne01; i01 += nth) { + const float * x = (float *) ((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03); + + ggml_float mean = 0.0; + for (int i00 = 0; i00 < ne00; i00++) { + mean += x[i00]; + } + + mean /= ne00; + + float * y = (float *) ((char *) dst->data + i01*nb1 + i02*nb2 + i03*nb3); + + ggml_float sum2 = 0.0; + for (int i00 = 0; i00 < ne00; i00++) { + ggml_float v = x[i00] - mean; + y[i00] = v; + sum2 += v*v; + } + + const float scale = 1.0/sqrt(sum2/ne00 + eps); + + ggml_vec_scale_f32(ne00, y, scale); + } + } + } +} + +void ggml_compute_forward_norm( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + struct ggml_tensor * dst) { + switch (src0->type) { + case GGML_TYPE_F32: + { + ggml_compute_forward_norm_f32(params, src0, dst); + } break; + case GGML_TYPE_I8: + case GGML_TYPE_I16: + case GGML_TYPE_I32: + case GGML_TYPE_F16: + case GGML_TYPE_COUNT: + { + assert(false); + } break; + } +} + +// ggml_compute_forward_mul_mat + +// helper function to determine if it is better to use BLAS or not +// for large matrices, BLAS is faster +bool ggml_compute_forward_mul_mat_use_blas( + const struct ggml_tensor * src0, + const struct ggml_tensor * src1, + struct ggml_tensor * dst) { + UNUSED(src0); + + const int ne10 = src1->ne[0]; + + const int ne0 = dst->ne[0]; + const int ne1 = dst->ne[1]; + + // TODO: find the optimal values for these + if (ggml_is_contiguous(src0) && ggml_is_contiguous(src1) && ne0 >= 32 && ne1 >= 32 && ne10 >= 32) { + //printf("BLAS: %d %d %d\n", ne0, ne1, ne10); + return true; + } + + return false; +} + +void ggml_compute_forward_mul_mat_f32( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + const struct 
ggml_tensor * src1, + struct ggml_tensor * dst) { + int64_t t0 = ggml_perf_time_us(); + UNUSED(t0); + + const int ne00 = src0->ne[0]; + const int ne01 = src0->ne[1]; + const int ne02 = src0->ne[2]; + const int ne03 = src0->ne[3]; + + const int ne10 = src1->ne[0]; + const int ne11 = src1->ne[1]; + const int ne12 = src1->ne[2]; + const int ne13 = src1->ne[3]; + + const int ne0 = dst->ne[0]; + const int ne1 = dst->ne[1]; + const int ne2 = dst->ne[2]; + const int ne3 = dst->ne[3]; + const int ne = ne0*ne1*ne2*ne3; + + const int nb00 = src0->nb[0]; + const int nb01 = src0->nb[1]; + const int nb02 = src0->nb[2]; + const int nb03 = src0->nb[3]; + + const int nb10 = src1->nb[0]; + const int nb11 = src1->nb[1]; + const int nb12 = src1->nb[2]; + const int nb13 = src1->nb[3]; + + const int nb0 = dst->nb[0]; + const int nb1 = dst->nb[1]; + const int nb2 = dst->nb[2]; + const int nb3 = dst->nb[3]; + + const int ith = params->ith; + const int nth = params->nth; + + assert(ne02 == ne12); + assert(ne03 == ne13); + assert(ne2 == ne12); + assert(ne3 == ne13); + + // TODO: we don't support permuted src0 + assert(nb00 == sizeof(float) || nb01 == sizeof(float)); + + // dst cannot be transposed or permuted + assert(nb0 == sizeof(float)); + assert(nb0 <= nb1); + assert(nb1 <= nb2); + assert(nb2 <= nb3); + + assert(ne0 == ne01); + assert(ne1 == ne11); + assert(ne2 == ne02); + assert(ne3 == ne03); + + // nb01 >= nb00 - src0 is not transposed + // compute by src0 rows + // + // nb00 < nb01 - src0 is transposed + // compute by src0 columns + +#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) + if (ggml_compute_forward_mul_mat_use_blas(src0, src1, dst)) { + GGML_ASSERT(nb10 == sizeof(float)); + + if (params->ith != 0) return; + + if (params->type == GGML_TASK_INIT) { + return; + } + + if (params->type == GGML_TASK_FINALIZE) { + return; + } + + for (int i03 = 0; i03 < ne03; i03++) { + for (int i02 = 0; i02 < ne02; i02++) { + const float * x = (float *) (src0->data); + const float * y = (float *) ((char *) src1->data + i02*nb12 + i03*nb13); + + float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3); + + // zT = y * xT + { + cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans, + ne11, ne01, ne10, + 1.0f, y, ne10, + x, ne10, + 0.0f, d, ne01); + } + } + } + + //printf("CBLAS F32 = %f ms, %d x %d x %d x %d\n", (ggml_perf_time_us() - t0)/1000.0, ne0, ne1, ne2, ne3); + + return; + } +#endif + + if (params->type == GGML_TASK_INIT) { + if (nb01 >= nb00) { + return; + } + + // TODO: fix this memset (wsize is overestimated) + memset(params->wdata, 0, params->wsize); + return; + } + + if (params->type == GGML_TASK_FINALIZE) { + if (nb01 >= nb00) { + return; + } + + // TODO: fix this memset (wsize is overestimated) + //assert(params->wsize == (ggml_nbytes(dst) + CACHE_LINE_SIZE)*nth); + + float * const wdata = params->wdata; + + // cols per thread + const int dc = (ne + nth - 1)/nth; + + // col range for this thread + const int ic0 = dc*ith; + const int ic1 = MIN(ic0 + dc, ne); + + ggml_vec_cpy_f32(ic1 - ic0, (float *) dst->data + ic0, wdata + ic0); + + for (int k = 1; k < nth; k++) { + ggml_vec_acc_f32(ic1 - ic0, (float *) dst->data + ic0, wdata + (ne + CACHE_LINE_SIZE_F32)*k + ic0); + } + + return; + } + + if (nb01 >= nb00) { + // TODO: do not support transposed src1 + assert(nb10 == sizeof(float)); + + // parallelize by src0 rows using ggml_vec_dot_f32 + + // total rows in src0 + const int nr = ne01*ne02*ne03; + + // rows per thread + const int dr = (nr + nth - 1)/nth; + + // row range for this thread + 
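+        /* [editor's note] dr is the ceiling of nr/nth, so thread ith owns the half-open row range
+           [ith*dr, MIN(ith*dr + dr, nr)).  Worked example: nr = 10 rows and nth = 4 threads gives dr = 3
+           and the ranges [0,3), [3,6), [6,9), [9,10) -- the last thread simply takes the remainder. */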
const int ir0 = dr*ith; + const int ir1 = MIN(ir0 + dr, nr); + + for (int ir = ir0; ir < ir1; ++ir) { + // src0 indices + const int i03 = ir/(ne02*ne01); + const int i02 = (ir - i03*ne02*ne01)/ne01; + const int i01 = (ir - i03*ne02*ne01 - i02*ne01); + + for (int ic = 0; ic < ne11; ++ic) { + // src1 indices + const int i13 = i03; + const int i12 = i02; + const int i11 = ic; + + // dst indices + const int i0 = i01; + const int i1 = i11; + const int i2 = i02; + const int i3 = i03; + + ggml_vec_dot_f32(ne00, + (float *) ((char *) dst->data + (i0*nb0 + i1*nb1 + i2*nb2 + i3*nb3)), + (float *) ((char *) src0->data + (i01*nb01 + i02*nb02 + i03*nb03)), + (float *) ((char *) src1->data + (i11*nb11 + i12*nb12 + i13*nb13))); + } + } + } else { + // parallelize by src1 columns using ggml_vec_mad_f32 + // each thread has its own work data + // during FINALIZE we accumulate all work data into dst + + // total columns in src1 + const int nc = ne10; + + // columns per thread + const int dc = (nc + nth - 1)/nth; + + // column range for this thread + const int ic0 = dc*ith; + const int ic1 = MIN(ic0 + dc, nc); + + // work data for thread + const int wo = (ne + CACHE_LINE_SIZE_F32)*ith; + float * const wdata = params->wdata; + + for (int i13 = 0; i13 < ne13; ++i13) { + for (int i12 = 0; i12 < ne12; ++i12) { + for (int i11 = 0; i11 < ne11; ++i11) { + for (int ic = ic0; ic < ic1; ++ic) { + // src1 indices + const int i10 = ic; + + // src0 indices + const int i03 = i13; + const int i02 = i12; + const int i00 = ic; + + // dst indices + const int i1 = i11; + const int i2 = i12; + const int i3 = i13; + + assert(sizeof(float)*(wo + i3*ne2*ne1*ne0 + i2*ne1*ne0 + i1*ne0 + ne01) <= params->wsize); + + ggml_vec_mad_f32(ne01, + (float *) (wdata + wo + i3*ne2*ne1*ne0 + i2*ne1*ne0 + i1*ne0), + (float *) ((char *) src0->data + (i00*nb00 + i02*nb02 + i03*nb03)), + *(float *) ((char *) src1->data + (i10*nb10 + i11*nb11 + i12*nb12 + i13*nb13))); + } + } + } + } + } + + //int64_t t1 = ggml_perf_time_us(); + //static int64_t acc = 0; + //acc += t1 - t0; + //if (t1 - t0 > 10) { + // printf("\n"); + // printf("ne00 = %5d, ne01 = %5d, ne02 = %5d, ne03 = %5d\n", ne00, ne01, ne02, ne03); + // printf("nb00 = %5d, nb01 = %5d, nb02 = %5d, nb03 = %5d\n", nb00, nb01, nb02, nb03); + // printf("ne10 = %5d, ne11 = %5d, ne12 = %5d, ne13 = %5d\n", ne10, ne11, ne12, ne13); + // printf("nb10 = %5d, nb11 = %5d, nb12 = %5d, nb13 = %5d\n", nb10, nb11, nb12, nb13); + + // printf("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX task %d/%d: %d us, acc = %d\n", ith, nth, (int) (t1 - t0), (int) acc); + //} +} + +void ggml_compute_forward_mul_mat_f16_f32( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + const struct ggml_tensor * src1, + struct ggml_tensor * dst) { + int64_t t0 = ggml_perf_time_us(); + UNUSED(t0); + + const int ne00 = src0->ne[0]; + const int ne01 = src0->ne[1]; + const int ne02 = src0->ne[2]; + const int ne03 = src0->ne[3]; + + const int ne10 = src1->ne[0]; + const int ne11 = src1->ne[1]; + const int ne12 = src1->ne[2]; + const int ne13 = src1->ne[3]; + + const int ne0 = dst->ne[0]; + const int ne1 = dst->ne[1]; + const int ne2 = dst->ne[2]; + const int ne3 = dst->ne[3]; + const int ne = ne0*ne1*ne2*ne3; + + const int nb00 = src0->nb[0]; + const int nb01 = src0->nb[1]; + const int nb02 = src0->nb[2]; + const int nb03 = src0->nb[3]; + + const int nb10 = src1->nb[0]; + const int nb11 = src1->nb[1]; + const int nb12 = src1->nb[2]; + const int nb13 = src1->nb[3]; + + const int nb0 = dst->nb[0]; + const 
int nb1 = dst->nb[1]; + const int nb2 = dst->nb[2]; + const int nb3 = dst->nb[3]; + + const int ith = params->ith; + const int nth = params->nth; + + GGML_ASSERT(ne02 == ne12); + GGML_ASSERT(ne03 == ne13); + GGML_ASSERT(ne2 == ne12); + GGML_ASSERT(ne3 == ne13); + + // TODO: we don't support permuted src0 + GGML_ASSERT(nb00 == sizeof(ggml_fp16_t) || nb01 == sizeof(ggml_fp16_t)); + + // dst cannot be transposed or permuted + GGML_ASSERT(nb0 == sizeof(float)); + GGML_ASSERT(nb0 <= nb1); + GGML_ASSERT(nb1 <= nb2); + GGML_ASSERT(nb2 <= nb3); + + GGML_ASSERT(ne0 == ne01); + GGML_ASSERT(ne1 == ne11); + GGML_ASSERT(ne2 == ne02); + GGML_ASSERT(ne3 == ne03); + + // nb01 >= nb00 - src0 is not transposed + // compute by src0 rows + // + // nb00 < nb01 - src0 is transposed + // compute by src0 columns + +#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) + if (ggml_compute_forward_mul_mat_use_blas(src0, src1, dst)) { + GGML_ASSERT(nb10 == sizeof(float)); + + if (params->ith != 0) return; + + if (params->type == GGML_TASK_INIT) { + return; + } + + if (params->type == GGML_TASK_FINALIZE) { + return; + } + + float * const wdata = params->wdata; + + for (int i03 = 0; i03 < ne03; i03++) { + for (int i02 = 0; i02 < ne02; i02++) { + { + int id = 0; + for (int i01 = 0; i01 < ne01; ++i01) { + for (int i00 = 0; i00 < ne00; ++i00) { + wdata[id++] = GGML_FP16_TO_FP32(*(ggml_fp16_t *) ((char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01 + i00*nb00)); + } + } + } + + const float * x = wdata; + const float * y = (float *) ((char *) src1->data + i02*nb12 + i03*nb13); + + // float * z = wdata + ne00*ne01; + + // z = x * yT + //{ + // cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans, + // ne01, ne11, ne00, + // 1.0f, x, ne00, + // y, ne00, + // 0.0f, z, ne11); + //} + + float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3); + + // transpose z + //for (int j = 0; j < ne11; ++j) { + // for (int i = 0; i < ne01; ++i) { + // d[j*ne01 + i] = z[i*ne11 + j]; + // } + //} + + { +#if 1 + // zT = y * xT + cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans, + ne11, ne01, ne10, + 1.0f, y, ne00, + x, ne00, + 0.0f, d, ne01); +#else + // zT = (xT * y)T + cblas_sgemm(CblasColMajor, CblasTrans, CblasNoTrans, + ne01, ne11, ne10, + 1.0f, x, ne00, + y, ne00, + 0.0f, d, ne01); +#endif + } + } + } + + //printf("CBLAS = %f ms, %d x %d x %d x %d\n", (ggml_perf_time_us() - t0)/1000.0, ne0, ne1, ne2, ne3); + + return; + } +#endif + + if (params->type == GGML_TASK_INIT) { + if (nb01 >= nb00) { + ggml_fp16_t * const wdata = params->wdata; + + int id = 0; + for (int i13 = 0; i13 < ne13; ++i13) { + for (int i12 = 0; i12 < ne12; ++i12) { + for (int i11 = 0; i11 < ne11; ++i11) { + for (int i10 = 0; i10 < ne10; ++i10) { + wdata[id++] = GGML_FP32_TO_FP16(*(float *)((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11 + i10*nb10)); + } + } + } + } + + GGML_ASSERT(id*sizeof(ggml_fp16_t) <= params->wsize); + + return; + } + + // TODO: fix this memset (wsize is overestimated) + memset(params->wdata, 0, params->wsize); + return; + } + + if (params->type == GGML_TASK_FINALIZE) { + if (nb01 >= nb00) { + return; + } + + // TODO: fix this memset (wsize is overestimated) + //assert(params->wsize == (ggml_nbytes(dst) + CACHE_LINE_SIZE)*nth); + + ggml_fp16_t * const wdata = params->wdata; + + // cols per thread + const int dc = (ne + nth - 1)/nth; + + // col range for this thread + const int ic0 = dc*ith; + const int ic1 = MIN(ic0 + dc, ne); + + for (int i = ic0; i < ic1; ++i) { + ((float *) dst->data)[i] = GGML_FP16_TO_FP32(wdata[i]); + } 
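+        /* [editor's note] in the transposed-src0 path each thread accumulated its partial result into its own
+           slice of wdata, offset by (ne + CACHE_LINE_SIZE_F32)*ith (the padding presumably keeps the slices on
+           separate cache lines).  The loop above converted this thread's column range of slice 0 into dst;
+           the loop below adds in the corresponding ranges of the remaining nth-1 slices. */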
+ + for (int k = 1; k < nth; k++) { + for (int i = ic0; i < ic1; ++i) { + ((float *) dst->data)[i] += GGML_FP16_TO_FP32(wdata[(ne + CACHE_LINE_SIZE_F32)*k + i]); + } + } + + return; + } + + if (nb01 >= nb00) { + // fp16 -> half the size, so divide by 2 + // TODO: do not support transposed src1 + assert(nb10/2 == sizeof(ggml_fp16_t)); + + // parallelize by src0 rows using ggml_vec_dot_f32 + + // total rows in src0 + const int nr = ne01*ne02*ne03; + + // rows per thread + const int dr = (nr + nth - 1)/nth; + + // row range for this thread + const int ir0 = dr*ith; + const int ir1 = MIN(ir0 + dr, nr); + + ggml_fp16_t * wdata = params->wdata; + + for (int ir = ir0; ir < ir1; ++ir) { + // src0 indices + const int i03 = ir/(ne02*ne01); + const int i02 = (ir - i03*ne02*ne01)/ne01; + const int i01 = (ir - i03*ne02*ne01 - i02*ne01); + + const int i13 = i03; + const int i12 = i02; + + const int i0 = i01; + const int i2 = i02; + const int i3 = i03; + + ggml_fp16_t * src0_row = (ggml_fp16_t *) ((char *) src0->data + (i01*nb01 + i02*nb02 + i03*nb03)); + ggml_fp16_t * src1_col = wdata + (i13*ne12*ne11 + i12*ne11 + 0)*ne00; + + float * dst_col = (float *) ((char *) dst->data + (i0*nb0 + 0*nb1 + i2*nb2 + i3*nb3)); + + for (int ic = 0; ic < ne11; ++ic) { + assert(ne00 % 32 == 0); + + ggml_vec_dot_f16(ne00, &dst_col[ic*ne0], src0_row, src1_col + ic*ne00); + } + } + } else { + // parallelize by src1 columns using ggml_vec_mad_f32 + // each thread has its own work data + // during FINALIZE we accumulate all work data into dst + + // total columns in src1 + const int nc = ne10; + + // columns per thread + const int dc = (nc + nth - 1)/nth; + + // column range for this thread + const int ic0 = dc*ith; + const int ic1 = MIN(ic0 + dc, nc); + + // work data for thread + const int wo = (ne + CACHE_LINE_SIZE_F32)*ith; + ggml_fp16_t * const wdata = params->wdata; + + for (int i13 = 0; i13 < ne13; ++i13) { + for (int i12 = 0; i12 < ne12; ++i12) { + for (int i11 = 0; i11 < ne11; ++i11) { + // dst indices + const int i1 = i11; + const int i2 = i12; + const int i3 = i13; + + ggml_fp16_t * dst_row = wdata + wo + i3*ne2*ne1*ne0 + i2*ne1*ne0 + i1*ne0; + + for (int ic = ic0; ic < ic1; ++ic) { + // src1 indices + const int i10 = ic; + + // src0 indices + const int i03 = i13; + const int i02 = i12; + const int i00 = ic; + + assert(sizeof(ggml_fp16_t)*(wo + i3*ne2*ne1*ne0 + i2*ne1*ne0 + i1*ne0 + ne01) <= params->wsize); + + ggml_fp16_t * src0_col = (ggml_fp16_t *) ((char *) src0->data + (i00*nb00 + i02*nb02 + i03*nb03)); + float src1_val = * (float *) ((char *) src1->data + (i10*nb10 + i11*nb11 + i12*nb12 + i13*nb13)); + + ggml_vec_mad_f16(ne01, dst_row, src0_col, src1_val); + } + } + } + } + } + + //int64_t t1 = ggml_time_us(); + //static int64_t acc = 0; + //acc += t1 - t0; + //if (t1 - t0 > 10) { + // printf("\n"); + // printf("ne00 = %5d, ne01 = %5d, ne02 = %5d, ne03 = %5d\n", ne00, ne01, ne02, ne03); + // printf("nb00 = %5d, nb01 = %5d, nb02 = %5d, nb03 = %5d\n", nb00, nb01, nb02, nb03); + // printf("ne10 = %5d, ne11 = %5d, ne12 = %5d, ne13 = %5d\n", ne10, ne11, ne12, ne13); + + // printf("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX task %d/%d: %d us, acc = %d\n", ith, nth, (int) (t1 - t0), (int) acc); + //} +} + +void ggml_compute_forward_mul_mat( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + const struct ggml_tensor * src1, + struct ggml_tensor * dst) { + switch (src0->type) { + case GGML_TYPE_F16: + { + ggml_compute_forward_mul_mat_f16_f32(params, src0, src1, dst); + } break; 
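+        /* [editor's note] the F16 kernel above keeps the src0 weights in half precision and, during
+           GGML_TASK_INIT in the non-transposed path, converts the F32 src1 activations into an FP16 copy in
+           wdata so the inner ggml_vec_dot_f16 runs entirely on 16-bit operands; the F32 kernel below reads
+           both tensors directly with ggml_vec_dot_f32 (or hands the whole product to BLAS when available). */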
+ case GGML_TYPE_F32: + { + ggml_compute_forward_mul_mat_f32(params, src0, src1, dst); + } break; + case GGML_TYPE_I8: + case GGML_TYPE_I16: + case GGML_TYPE_I32: + case GGML_TYPE_COUNT: + { + assert(false); + } break; + } +} + +// ggml_compute_forward_scale + +void ggml_compute_forward_scale_f32( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + const struct ggml_tensor * src1, + struct ggml_tensor * dst) { + GGML_ASSERT(ggml_is_contiguous(src0)); + GGML_ASSERT(ggml_is_contiguous(dst)); + GGML_ASSERT(ggml_are_same_shape(src0, dst)); + GGML_ASSERT(ggml_is_scalar(src1)); + + if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + return; + } + + // scale factor + const float v = *(float *) src1->data; + + const int ith = params->ith; + const int nth = params->nth; + + const int nc = src0->ne[0]; + const int nr = ggml_nrows(src0); + + // rows per thread + const int dr = (nr + nth - 1)/nth; + + // row range for this thread + const int ir0 = dr*ith; + const int ir1 = MIN(ir0 + dr, nr); + + for (int i1 = ir0; i1 < ir1; i1++) { + ggml_vec_scale_f32(nc, (float *) ((char *) dst->data + i1*(dst->nb[1])), v); + } +} + +void ggml_compute_forward_scale( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + const struct ggml_tensor * src1, + struct ggml_tensor * dst) { + switch (src0->type) { + case GGML_TYPE_F32: + { + ggml_compute_forward_scale_f32(params, src0, src1, dst); + } break; + case GGML_TYPE_I8: + case GGML_TYPE_I16: + case GGML_TYPE_I32: + case GGML_TYPE_F16: + case GGML_TYPE_COUNT: + { + assert(false); + } break; + } +} + +// ggml_compute_forward_cpy + +void ggml_compute_forward_cpy( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + struct ggml_tensor * dst) { + ggml_compute_forward_dup(params, src0, dst); +} + +// ggml_compute_forward_reshape + +void ggml_compute_forward_reshape( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + struct ggml_tensor * dst) { + // NOP + UNUSED(params); + UNUSED(src0); + UNUSED(dst); +} + +// ggml_compute_forward_view + +void ggml_compute_forward_view( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0) { + // NOP + UNUSED(params); + UNUSED(src0); +} + +// ggml_compute_forward_permute + +void ggml_compute_forward_permute( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0) { + // NOP + UNUSED(params); + UNUSED(src0); +} + +// ggml_compute_forward_transpose + +void ggml_compute_forward_transpose( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0) { + // NOP + UNUSED(params); + UNUSED(src0); +} + +// ggml_compute_forward_get_rows + +void ggml_compute_forward_get_rows_f16( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + const struct ggml_tensor * src1, + struct ggml_tensor * dst) { + assert(params->ith == 0); + + if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + return; + } + + const int nc = src0->ne[0]; + const int nr = ggml_nelements(src1); + + assert( dst->ne[0] == nc); + assert( dst->ne[1] == nr); + assert(src0->nb[0] == sizeof(ggml_fp16_t)); + + for (int i = 0; i < nr; ++i) { + const int r = ((int32_t *) src1->data)[i]; + + for (int j = 0; j < nc; ++j) { + ggml_fp16_t v = ((ggml_fp16_t *) ((char *) src0->data + r*src0->nb[1]))[j]; + ((float *) ((char *) dst->data + i*dst->nb[1]))[j] = GGML_FP16_TO_FP32(v); + } + } +} + +void ggml_compute_forward_get_rows_f32( + const 
struct ggml_compute_params * params, + const struct ggml_tensor * src0, + const struct ggml_tensor * src1, + struct ggml_tensor * dst) { + assert(params->ith == 0); + + if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + return; + } + + const int nc = src0->ne[0]; + const int nr = ggml_nelements(src1); + + assert( dst->ne[0] == nc); + assert( dst->ne[1] == nr); + assert(src0->nb[0] == sizeof(float)); + + for (int i = 0; i < nr; ++i) { + const int r = ((int32_t *) src1->data)[i]; + + ggml_vec_cpy_f32(nc, + (float *) ((char *) dst->data + i*dst->nb[1]), + (float *) ((char *) src0->data + r*src0->nb[1])); + } +} + +void ggml_compute_forward_get_rows( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + const struct ggml_tensor * src1, + struct ggml_tensor * dst) { + switch (src0->type) { + case GGML_TYPE_F16: + { + ggml_compute_forward_get_rows_f16(params, src0, src1, dst); + } break; + case GGML_TYPE_F32: + { + ggml_compute_forward_get_rows_f32(params, src0, src1, dst); + } break; + case GGML_TYPE_I8: + case GGML_TYPE_I16: + case GGML_TYPE_I32: + case GGML_TYPE_COUNT: + { + assert(false); + } break; + } +} + +// ggml_compute_forward_diag_mask_inf + +void ggml_compute_forward_diag_mask_inf_f32( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + const struct ggml_tensor * src1, + struct ggml_tensor * dst) { + assert(params->ith == 0); + assert(src1->type == GGML_TYPE_I32); + assert(ggml_nelements(src1) == 1); + + if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + return; + } + + const int n_past = ((int32_t *) src1->data)[0]; + + // TODO: handle transposed/permuted matrices + + const int n = ggml_nrows(src0); + const int nc = src0->ne[0]; + const int nr = src0->ne[1]; + const int nz = n/nr; + + assert( dst->nb[0] == sizeof(float)); + assert(src0->nb[0] == sizeof(float)); + + for (int k = 0; k < nz; k++) { + for (int j = 0; j < nr; j++) { + for (int i = n_past; i < nc; i++) { + if (i > n_past + j) { + *(float *)((char *) dst->data + k*dst->nb[2] + j*dst->nb[1] + i*dst->nb[0]) = -INFINITY; + } + } + } + } +} + +void ggml_compute_forward_diag_mask_inf( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + const struct ggml_tensor * src1, + struct ggml_tensor * dst) { + switch (src0->type) { + case GGML_TYPE_F32: + { + ggml_compute_forward_diag_mask_inf_f32(params, src0, src1, dst); + } break; + case GGML_TYPE_I8: + case GGML_TYPE_I16: + case GGML_TYPE_I32: + case GGML_TYPE_F16: + case GGML_TYPE_COUNT: + { + assert(false); + } break; + } +} + +// ggml_compute_forward_soft_max + +void ggml_compute_forward_soft_max_f32( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + struct ggml_tensor * dst) { + GGML_ASSERT(ggml_is_contiguous(src0)); + GGML_ASSERT(ggml_is_contiguous(dst)); + GGML_ASSERT(ggml_are_same_shape(src0, dst)); + + if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + return; + } + + // TODO: handle transposed/permuted matrices + + const int ith = params->ith; + const int nth = params->nth; + + const int nc = src0->ne[0]; + const int nr = ggml_nrows(src0); + + // rows per thread + const int dr = (nr + nth - 1)/nth; + + // row range for this thread + const int ir0 = dr*ith; + const int ir1 = MIN(ir0 + dr, nr); + + for (int i1 = ir0; i1 < ir1; i1++) { + float *p = (float *)((char *) dst->data + i1*dst->nb[1]); + +#ifndef NDEBUG + for (int i = 0; i < nc; ++i) { + assert(!isnan(p[i])); + } +#endif + 
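+        /* [editor's note] what follows is the standard numerically-stable softmax: the row maximum is
+           subtracted before exponentiating so the largest exponent is exactly 0, and the row is then rescaled
+           by 1/sum.  exp() itself is a table lookup: (p[i] - max) is rounded to FP16 and its bit pattern
+           indexes table_exp_f16 (presumably precomputed elsewhere in ggml), trading a little precision for
+           speed; -INFINITY entries (e.g. from diag_mask_inf) become 0. */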
+ float max = -INFINITY; + for (int i = 0; i < nc; i++) { + max = MAX(max, p[i]); + } + + ggml_float sum = 0.0; + + uint16_t ss; + for (int i = 0; i < nc; i++) { + if (p[i] == -INFINITY) { + p[i] = 0.0; + } else { + //const float val = (p[i] == -INFINITY) ? 0.0 : exp(p[i] - max); + ggml_fp16_t s = GGML_FP32_TO_FP16(p[i] - max); + memcpy(&ss, &s, sizeof(ss)); + const float val = GGML_FP16_TO_FP32(table_exp_f16[ss]); + sum += val; + p[i] = val; + } + } + + assert(sum > 0.0f); + + sum = 1.0/sum; + ggml_vec_scale_f32(nc, p, sum); + +#ifndef NDEBUG + for (int i = 0; i < nc; ++i) { + assert(!isnan(p[i])); + assert(!isinf(p[i])); + } +#endif + } +} + +void ggml_compute_forward_soft_max( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + struct ggml_tensor * dst) { + switch (src0->type) { + case GGML_TYPE_F32: + { + ggml_compute_forward_soft_max_f32(params, src0, dst); + } break; + case GGML_TYPE_I8: + case GGML_TYPE_I16: + case GGML_TYPE_I32: + case GGML_TYPE_F16: + case GGML_TYPE_COUNT: + { + assert(false); + } break; + } +} + +// ggml_compute_forward_rope + +void ggml_compute_forward_rope_f32( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + const struct ggml_tensor * src1, + struct ggml_tensor * dst) { + assert(params->ith == 0); + assert(src1->type == GGML_TYPE_I32); + assert(ggml_nelements(src1) == 3); + + if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + return; + } + + const int n_past = ((int32_t *) src1->data)[0]; + const int n_dims = ((int32_t *) src1->data)[1]; + const int mode = ((int32_t *) src1->data)[2]; + + //const int ne0 = src0->ne[0]; + const int ne1 = src0->ne[1]; + const int ne2 = src0->ne[2]; + const int ne3 = src0->ne[3]; + + const int nb0 = src0->nb[0]; + const int nb1 = src0->nb[1]; + const int nb2 = src0->nb[2]; + const int nb3 = src0->nb[3]; + + //printf("ne0: %d, ne1: %d, ne2: %d, ne3: %d\n", ne0, ne1, ne2, ne3); + //printf("n_past = %d, ne2 = %d\n", n_past, ne2); + + assert(nb0 == sizeof(float)); + + // TODO: optimize + for (int i3 = 0; i3 < ne3; i3++) { + for (int i2 = (mode == 0 ? 0 : n_past); i2 < ne2; i2++) { + const int p = (mode == 0 ? 
n_past + i2 : i2); + for (int i1 = 0; i1 < ne1; i1++) { + for (int i0 = 0; i0 < n_dims; i0 += 2) { + const double theta = pow(10000.0, ((double)-i0)/n_dims); + + const double cos_theta = cos(p*theta); + const double sin_theta = sin(p*theta); + + const float * const src = (float *)((char *) src0->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0); + float * dst_data = (float *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0); + + double x0 = src[0]; + double x1 = src[1]; + + dst_data[0] = x0*cos_theta - x1*sin_theta; + dst_data[1] = x0*sin_theta + x1*cos_theta; + } + } + } + } +} + +void ggml_compute_forward_rope( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + const struct ggml_tensor * src1, + struct ggml_tensor * dst) { + switch (src0->type) { + case GGML_TYPE_F32: + { + ggml_compute_forward_rope_f32(params, src0, src1, dst); + } break; + case GGML_TYPE_I8: + case GGML_TYPE_I16: + case GGML_TYPE_I32: + case GGML_TYPE_F16: + case GGML_TYPE_COUNT: + { + assert(false); + } break; + } +} + +// ggml_compute_forward_conv_1d_1s + +void ggml_compute_forward_conv_1d_1s_f16_f32( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + const struct ggml_tensor * src1, + struct ggml_tensor * dst) { + GGML_ASSERT(src0->type == GGML_TYPE_F16); + GGML_ASSERT(src1->type == GGML_TYPE_F32); + GGML_ASSERT( dst->type == GGML_TYPE_F32); + + int64_t t0 = ggml_perf_time_us(); + UNUSED(t0); + + const int ne00 = src0->ne[0]; + const int ne01 = src0->ne[1]; + const int ne02 = src0->ne[2]; + //const int ne03 = src0->ne[3]; + + const int ne10 = src1->ne[0]; + const int ne11 = src1->ne[1]; + //const int ne12 = src1->ne[2]; + //const int ne13 = src1->ne[3]; + + //const int ne0 = dst->ne[0]; + //const int ne1 = dst->ne[1]; + //const int ne2 = dst->ne[2]; + //const int ne3 = dst->ne[3]; + //const int ne = ne0*ne1*ne2*ne3; + + const int nb00 = src0->nb[0]; + const int nb01 = src0->nb[1]; + const int nb02 = src0->nb[2]; + //const int nb03 = src0->nb[3]; + + const int nb10 = src1->nb[0]; + const int nb11 = src1->nb[1]; + //const int nb12 = src1->nb[2]; + //const int nb13 = src1->nb[3]; + + //const int nb0 = dst->nb[0]; + const int nb1 = dst->nb[1]; + //const int nb2 = dst->nb[2]; + //const int nb3 = dst->nb[3]; + + const int ith = params->ith; + const int nth = params->nth; + + const int nk = ne00; + const int nh = nk/2; + + const int ew0 = ggml_up32(ne01); + + GGML_ASSERT(ne00 % 2 == 1); // TODO: support even kernel sizes + GGML_ASSERT(nb00 == sizeof(ggml_fp16_t)); + GGML_ASSERT(nb10 == sizeof(float)); + + if (params->type == GGML_TASK_INIT) { + // TODO: fix this memset (wsize is overestimated) + memset(params->wdata, 0, params->wsize); + + // prepare kernel data (src0) + { + ggml_fp16_t * const wdata = (ggml_fp16_t *) params->wdata + 0; + + for (int i02 = 0; i02 < ne02; i02++) { + for (int i01 = 0; i01 < ne01; i01++) { + const ggml_fp16_t * const src = (ggml_fp16_t *)((char *) src0->data + i02*nb02 + i01*nb01); + ggml_fp16_t * dst_data = wdata + i02*ew0*ne00; + for (int i00 = 0; i00 < ne00; i00++) { + dst_data[i00*ew0 + i01] = src[i00]; + } + } + } + } + + // prepare source data (src1) + { + ggml_fp16_t * const wdata = (ggml_fp16_t *) params->wdata + ne02*ew0*ne00; + + for (int i11 = 0; i11 < ne11; i11++) { + const float * const src = (float *)((char *) src1->data + i11*nb11); + ggml_fp16_t * dst_data = wdata; + for (int i10 = 0; i10 < ne10; i10++) { + dst_data[(i10 + nh)*ew0 + i11] = GGML_FP32_TO_FP16(src[i10]); + } + } + } + + return; + } + + if (params->type 
== GGML_TASK_FINALIZE) { + return; + } + + // total rows in dst + const int nr = ne02; + + // rows per thread + const int dr = (nr + nth - 1)/nth; + + // row range for this thread + const int ir0 = dr*ith; + const int ir1 = MIN(ir0 + dr, nr); + + for (int i1 = ir0; i1 < ir1; i1++) { + float * dst_data = (float *)((char *) dst->data + i1*nb1); + for (int i0 = 0; i0 < ne10; ++i0) { + dst_data[i0] = 0; + for (int k = -nh; k <= nh; k++) { + float v = 0.0f; + ggml_vec_dot_f16(ew0, &v, + (ggml_fp16_t *) params->wdata + i1*ew0*ne00 + (nh + k)*ew0, + (ggml_fp16_t *) params->wdata + ne02*ew0*ne00 + (i0 + nh + k)*ew0); + + dst_data[i0] += v; + } + } + } +} + +void ggml_compute_forward_conv_1d_1s_f32( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + const struct ggml_tensor * src1, + struct ggml_tensor * dst) { + GGML_ASSERT(src0->type == GGML_TYPE_F32); + GGML_ASSERT(src1->type == GGML_TYPE_F32); + GGML_ASSERT( dst->type == GGML_TYPE_F32); + + int64_t t0 = ggml_perf_time_us(); + UNUSED(t0); + + const int ne00 = src0->ne[0]; + const int ne01 = src0->ne[1]; + const int ne02 = src0->ne[2]; + //const int ne03 = src0->ne[3]; + + const int ne10 = src1->ne[0]; + const int ne11 = src1->ne[1]; + //const int ne12 = src1->ne[2]; + //const int ne13 = src1->ne[3]; + + //const int ne0 = dst->ne[0]; + //const int ne1 = dst->ne[1]; + //const int ne2 = dst->ne[2]; + //const int ne3 = dst->ne[3]; + //const int ne = ne0*ne1*ne2*ne3; + + const int nb00 = src0->nb[0]; + const int nb01 = src0->nb[1]; + const int nb02 = src0->nb[2]; + //const int nb03 = src0->nb[3]; + + const int nb10 = src1->nb[0]; + const int nb11 = src1->nb[1]; + //const int nb12 = src1->nb[2]; + //const int nb13 = src1->nb[3]; + + //const int nb0 = dst->nb[0]; + const int nb1 = dst->nb[1]; + //const int nb2 = dst->nb[2]; + //const int nb3 = dst->nb[3]; + + const int ith = params->ith; + const int nth = params->nth; + + const int nk = ne00; + const int nh = nk/2; + + const int ew0 = ggml_up32(ne01); + + GGML_ASSERT(ne00 % 2 == 1); // TODO: support even kernel sizes + GGML_ASSERT(nb00 == sizeof(float)); + GGML_ASSERT(nb10 == sizeof(float)); + + if (params->type == GGML_TASK_INIT) { + // TODO: fix this memset (wsize is overestimated) + memset(params->wdata, 0, params->wsize); + + // prepare kernel data (src0) + { + float * const wdata = (float *) params->wdata + 0; + + for (int i02 = 0; i02 < ne02; i02++) { + for (int i01 = 0; i01 < ne01; i01++) { + const float * const src = (float *)((char *) src0->data + i02*nb02 + i01*nb01); + float * dst_data = wdata + i02*ew0*ne00; + for (int i00 = 0; i00 < ne00; i00++) { + dst_data[i00*ew0 + i01] = src[i00]; + } + } + } + } + + // prepare source data (src1) + { + float * const wdata = (float *) params->wdata + ne02*ew0*ne00; + + for (int i11 = 0; i11 < ne11; i11++) { + const float * const src = (float *)((char *) src1->data + i11*nb11); + float * dst_data = wdata; + for (int i10 = 0; i10 < ne10; i10++) { + dst_data[(i10 + nh)*ew0 + i11] = src[i10]; + } + } + } + + return; + } + + if (params->type == GGML_TASK_FINALIZE) { + return; + } + + // total rows in dst + const int nr = ne02; + + // rows per thread + const int dr = (nr + nth - 1)/nth; + + // row range for this thread + const int ir0 = dr*ith; + const int ir1 = MIN(ir0 + dr, nr); + + for (int i1 = ir0; i1 < ir1; i1++) { + float * dst_data = (float *)((char *) dst->data + i1*nb1); + for (int i0 = 0; i0 < ne10; ++i0) { + dst_data[i0] = 0; + for (int k = -nh; k <= nh; k++) { + float v = 0.0f; + ggml_vec_dot_f32(ew0, &v, + (float 
*) params->wdata + i1*ew0*ne00 + (nh + k)*ew0, + (float *) params->wdata + ne02*ew0*ne00 + (i0 + nh + k)*ew0); + + dst_data[i0] += v; + } + } + } +} + +void ggml_compute_forward_conv_1d_1s( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + const struct ggml_tensor * src1, + struct ggml_tensor * dst) { + switch (src0->type) { + case GGML_TYPE_F16: + { + ggml_compute_forward_conv_1d_1s_f16_f32(params, src0, src1, dst); + } break; + case GGML_TYPE_F32: + { + ggml_compute_forward_conv_1d_1s_f32(params, src0, src1, dst); + } break; + case GGML_TYPE_I8: + case GGML_TYPE_I16: + case GGML_TYPE_I32: + case GGML_TYPE_COUNT: + { + GGML_ASSERT(false); + } break; + } +} + +// ggml_compute_forward_conv_1d_2s + +void ggml_compute_forward_conv_1d_2s_f16_f32( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + const struct ggml_tensor * src1, + struct ggml_tensor * dst) { + GGML_ASSERT(src0->type == GGML_TYPE_F16); + GGML_ASSERT(src1->type == GGML_TYPE_F32); + GGML_ASSERT( dst->type == GGML_TYPE_F32); + + int64_t t0 = ggml_perf_time_us(); + UNUSED(t0); + + const int ne00 = src0->ne[0]; + const int ne01 = src0->ne[1]; + const int ne02 = src0->ne[2]; + //const int ne03 = src0->ne[3]; + + const int ne10 = src1->ne[0]; + const int ne11 = src1->ne[1]; + //const int ne12 = src1->ne[2]; + //const int ne13 = src1->ne[3]; + + //const int ne0 = dst->ne[0]; + //const int ne1 = dst->ne[1]; + //const int ne2 = dst->ne[2]; + //const int ne3 = dst->ne[3]; + //const int ne = ne0*ne1*ne2*ne3; + + const int nb00 = src0->nb[0]; + const int nb01 = src0->nb[1]; + const int nb02 = src0->nb[2]; + //const int nb03 = src0->nb[3]; + + const int nb10 = src1->nb[0]; + const int nb11 = src1->nb[1]; + //const int nb12 = src1->nb[2]; + //const int nb13 = src1->nb[3]; + + //const int nb0 = dst->nb[0]; + const int nb1 = dst->nb[1]; + //const int nb2 = dst->nb[2]; + //const int nb3 = dst->nb[3]; + + const int ith = params->ith; + const int nth = params->nth; + + const int nk = ne00; + const int nh = nk/2; + + const int ew0 = ggml_up32(ne01); + + GGML_ASSERT(ne00 % 2 == 1); // TODO: support even kernel sizes + GGML_ASSERT(nb00 == sizeof(ggml_fp16_t)); + GGML_ASSERT(nb10 == sizeof(float)); + + if (params->type == GGML_TASK_INIT) { + // TODO: fix this memset (wsize is overestimated) + memset(params->wdata, 0, params->wsize); + + // prepare kernel data (src0) + { + ggml_fp16_t * const wdata = (ggml_fp16_t *) params->wdata + 0; + + for (int i02 = 0; i02 < ne02; i02++) { + for (int i01 = 0; i01 < ne01; i01++) { + const ggml_fp16_t * const src = (ggml_fp16_t *)((char *) src0->data + i02*nb02 + i01*nb01); + ggml_fp16_t * dst_data = wdata + i02*ew0*ne00; + for (int i00 = 0; i00 < ne00; i00++) { + dst_data[i00*ew0 + i01] = src[i00]; + } + } + } + } + + // prepare source data (src1) + { + ggml_fp16_t * const wdata = (ggml_fp16_t *) params->wdata + ne02*ew0*ne00; + + for (int i11 = 0; i11 < ne11; i11++) { + const float * const src = (float *)((char *) src1->data + i11*nb11); + ggml_fp16_t * dst_data = wdata; + for (int i10 = 0; i10 < ne10; i10++) { + dst_data[(i10 + nh)*ew0 + i11] = GGML_FP32_TO_FP16(src[i10]); + } + } + } + + return; + } + + if (params->type == GGML_TASK_FINALIZE) { + return; + } + + // total rows in dst + const int nr = ne02; + + // rows per thread + const int dr = (nr + nth - 1)/nth; + + // row range for this thread + const int ir0 = dr*ith; + const int ir1 = MIN(ir0 + dr, nr); + + for (int i1 = ir0; i1 < ir1; i1++) { + float * dst_data = (float *)((char *) dst->data + 
i1*nb1); + for (int i0 = 0; i0 < ne10; i0 += 2) { + dst_data[i0/2] = 0; + for (int k = -nh; k <= nh; k++) { + float v = 0.0f; + ggml_vec_dot_f16(ew0, &v, + (ggml_fp16_t *) params->wdata + i1*ew0*ne00 + (nh + k)*ew0, + (ggml_fp16_t *) params->wdata + ne02*ew0*ne00 + (i0 + nh + k)*ew0); + + dst_data[i0/2] += v; + } + } + } +} + +void ggml_compute_forward_conv_1d_2s_f32( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + const struct ggml_tensor * src1, + struct ggml_tensor * dst) { + GGML_ASSERT(src0->type == GGML_TYPE_F32); + GGML_ASSERT(src1->type == GGML_TYPE_F32); + GGML_ASSERT( dst->type == GGML_TYPE_F32); + + int64_t t0 = ggml_perf_time_us(); + UNUSED(t0); + + const int ne00 = src0->ne[0]; + const int ne01 = src0->ne[1]; + const int ne02 = src0->ne[2]; + //const int ne03 = src0->ne[3]; + + const int ne10 = src1->ne[0]; + const int ne11 = src1->ne[1]; + //const int ne12 = src1->ne[2]; + //const int ne13 = src1->ne[3]; + + //const int ne0 = dst->ne[0]; + //const int ne1 = dst->ne[1]; + //const int ne2 = dst->ne[2]; + //const int ne3 = dst->ne[3]; + //const int ne = ne0*ne1*ne2*ne3; + + const int nb00 = src0->nb[0]; + const int nb01 = src0->nb[1]; + const int nb02 = src0->nb[2]; + //const int nb03 = src0->nb[3]; + + const int nb10 = src1->nb[0]; + const int nb11 = src1->nb[1]; + //const int nb12 = src1->nb[2]; + //const int nb13 = src1->nb[3]; + + //const int nb0 = dst->nb[0]; + const int nb1 = dst->nb[1]; + //const int nb2 = dst->nb[2]; + //const int nb3 = dst->nb[3]; + + const int ith = params->ith; + const int nth = params->nth; + + const int nk = ne00; + const int nh = nk/2; + + const int ew0 = ggml_up32(ne01); + + GGML_ASSERT(ne00 % 2 == 1); // TODO: support even kernel sizes + GGML_ASSERT(nb00 == sizeof(float)); + GGML_ASSERT(nb10 == sizeof(float)); + + if (params->type == GGML_TASK_INIT) { + // TODO: fix this memset (wsize is overestimated) + memset(params->wdata, 0, params->wsize); + + // prepare kernel data (src0) + { + float * const wdata = (float *) params->wdata + 0; + + for (int i02 = 0; i02 < ne02; i02++) { + for (int i01 = 0; i01 < ne01; i01++) { + const float * const src = (float *)((char *) src0->data + i02*nb02 + i01*nb01); + float * dst_data = wdata + i02*ew0*ne00; + for (int i00 = 0; i00 < ne00; i00++) { + dst_data[i00*ew0 + i01] = src[i00]; + } + } + } + } + + // prepare source data (src1) + { + float * const wdata = (float *) params->wdata + ne02*ew0*ne00; + + for (int i11 = 0; i11 < ne11; i11++) { + const float * const src = (float *)((char *) src1->data + i11*nb11); + float * dst_data = wdata; + for (int i10 = 0; i10 < ne10; i10++) { + dst_data[(i10 + nh)*ew0 + i11] = src[i10]; + } + } + } + + return; + } + + if (params->type == GGML_TASK_FINALIZE) { + return; + } + + // total rows in dst + const int nr = ne02; + + // rows per thread + const int dr = (nr + nth - 1)/nth; + + // row range for this thread + const int ir0 = dr*ith; + const int ir1 = MIN(ir0 + dr, nr); + + for (int i1 = ir0; i1 < ir1; i1++) { + float * dst_data = (float *)((char *) dst->data + i1*nb1); + for (int i0 = 0; i0 < ne10; i0 += 2) { + dst_data[i0/2] = 0; + for (int k = -nh; k <= nh; k++) { + float v = 0.0f; + ggml_vec_dot_f32(ew0, &v, + (float *) params->wdata + i1*ew0*ne00 + (nh + k)*ew0, + (float *) params->wdata + ne02*ew0*ne00 + (i0 + nh + k)*ew0); + + dst_data[i0/2] += v; + } + } + } +} + +void ggml_compute_forward_conv_1d_2s( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + const struct ggml_tensor * src1, + struct 
ggml_tensor * dst) { + switch (src0->type) { + case GGML_TYPE_F16: + { + ggml_compute_forward_conv_1d_2s_f16_f32(params, src0, src1, dst); + } break; + case GGML_TYPE_F32: + { + ggml_compute_forward_conv_1d_2s_f32(params, src0, src1, dst); + } break; + case GGML_TYPE_I8: + case GGML_TYPE_I16: + case GGML_TYPE_I32: + case GGML_TYPE_COUNT: + { + GGML_ASSERT(false); + } break; + } +} + +// ggml_compute_forward_flash_attn + +void ggml_compute_forward_flash_attn_f32( + const struct ggml_compute_params * params, + const struct ggml_tensor * q, + const struct ggml_tensor * k, + const struct ggml_tensor * v, + const bool masked, + struct ggml_tensor * dst) { + int64_t t0 = ggml_perf_time_us(); + UNUSED(t0); + + const int neq0 = q->ne[0]; + const int neq1 = q->ne[1]; + const int neq2 = q->ne[2]; + const int neq3 = q->ne[3]; + + const int nek0 = k->ne[0]; + const int nek1 = k->ne[1]; + //const int nek2 = k->ne[2]; + //const int nek3 = k->ne[3]; + + //const int nev0 = v->ne[0]; + const int nev1 = v->ne[1]; + //const int nev2 = v->ne[2]; + //const int nev3 = v->ne[3]; + + const int ne0 = dst->ne[0]; + const int ne1 = dst->ne[1]; + //const int ne2 = dst->ne[2]; + //const int ne3 = dst->ne[3]; + + const int nbk0 = k->nb[0]; + const int nbk1 = k->nb[1]; + const int nbk2 = k->nb[2]; + const int nbk3 = k->nb[3]; + + const int nbq0 = q->nb[0]; + const int nbq1 = q->nb[1]; + const int nbq2 = q->nb[2]; + const int nbq3 = q->nb[3]; + + const int nbv0 = v->nb[0]; + const int nbv1 = v->nb[1]; + const int nbv2 = v->nb[2]; + const int nbv3 = v->nb[3]; + + const int nb0 = dst->nb[0]; + const int nb1 = dst->nb[1]; + const int nb2 = dst->nb[2]; + const int nb3 = dst->nb[3]; + + const int ith = params->ith; + const int nth = params->nth; + + const int D = neq0; + const int N = neq1; + const int P = nek1 - N; + const int M = P + N; + + GGML_ASSERT(ne0 == D); + GGML_ASSERT(ne1 == N); + GGML_ASSERT(P >= 0); + + GGML_ASSERT(nbq0 == sizeof(float)); + GGML_ASSERT(nbk0 == sizeof(float)); + GGML_ASSERT(nbv0 == sizeof(float)); + + GGML_ASSERT(neq0 == D); + GGML_ASSERT(nek0 == D); + GGML_ASSERT(nev1 == D); + + GGML_ASSERT(neq1 == N); + GGML_ASSERT(nek1 == N + P); + GGML_ASSERT(nev1 == D); + + // dst cannot be transposed or permuted + GGML_ASSERT(nb0 == sizeof(float)); + GGML_ASSERT(nb0 <= nb1); + GGML_ASSERT(nb1 <= nb2); + GGML_ASSERT(nb2 <= nb3); + + if (params->type == GGML_TASK_INIT) { + return; + } + + if (params->type == GGML_TASK_FINALIZE) { + return; + } + + // parallelize by q rows using ggml_vec_dot_f32 + + // total rows in q + const int nr = neq1*neq2*neq3; + + // rows per thread + const int dr = (nr + nth - 1)/nth; + + // row range for this thread + const int ir0 = dr*ith; + const int ir1 = MIN(ir0 + dr, nr); + + const float scale = 1.0/sqrt((double) D); + + //printf("P=%d N=%d D=%d ir0=%d ir1=%d scale = %f\n", P, N, D, ir0, ir1, scale); + + for (int ir = ir0; ir < ir1; ++ir) { + // q indices + const int iq3 = ir/(neq2*neq1); + const int iq2 = (ir - iq3*neq2*neq1)/neq1; + const int iq1 = (ir - iq3*neq2*neq1 - iq2*neq1); + + float * S = (float *) params->wdata + ith*(M + CACHE_LINE_SIZE_F32); + + for (int ic = 0; ic < nek1; ++ic) { + // k indices + const int ik3 = iq3; + const int ik2 = iq2; + const int ik1 = ic; + + // S indices + const int i1 = ik1; + + ggml_vec_dot_f32(neq0, + S + i1, + (float *) ((char *) k->data + (ik1*nbk1 + ik2*nbk2 + ik3*nbk3)), + (float *) ((char *) q->data + (iq1*nbq1 + iq2*nbq2 + iq3*nbq3))); + } + + // scale + ggml_vec_scale_f32(nek1, S, scale); + + if (masked) { + for (int i = P; i < M; 
i++) { + if (i > P + iq1) { + S[i] = -INFINITY; + } + } + } + + // softmax + { + float max = -INFINITY; + for (int i = 0; i < M; i++) { + max = MAX(max, S[i]); + } + + ggml_float sum = 0.0; + + uint16_t ss; + for (int i = 0; i < M; i++) { + if (S[i] == -INFINITY) { + S[i] = 0.0; + } else { + //const float val = (S[i] == -INFINITY) ? 0.0 : exp(S[i] - max); + ggml_fp16_t s = GGML_FP32_TO_FP16(S[i] - max); + memcpy(&ss, &s, sizeof(ss)); + const float val = GGML_FP16_TO_FP32(table_exp_f16[ss]); + sum += val; + S[i] = val; + } + } + + assert(sum > 0.0f); + + sum = 1.0/sum; + ggml_vec_scale_f32(M, S, sum); + } + + for (int ic = 0; ic < nev1; ++ic) { + // dst indices + const int i1 = iq1; + const int i2 = iq2; + const int i3 = iq3; + + ggml_vec_dot_f32(nek1, + (float *) ((char *) dst->data + (ic*nb0 + i1*nb1 + i2*nb2 + i3*nb3)), + (float *) ((char *) v->data + ( ic*nbv1 + i2*nbv2 + i3*nbv3)), + S); + } + } +} + +void ggml_compute_forward_flash_attn_f16( + const struct ggml_compute_params * params, + const struct ggml_tensor * q, + const struct ggml_tensor * k, + const struct ggml_tensor * v, + const bool masked, + struct ggml_tensor * dst) { + int64_t t0 = ggml_perf_time_us(); + UNUSED(t0); + + const int neq0 = q->ne[0]; + const int neq1 = q->ne[1]; + const int neq2 = q->ne[2]; + const int neq3 = q->ne[3]; + + const int nek0 = k->ne[0]; + const int nek1 = k->ne[1]; + //const int nek2 = k->ne[2]; + //const int nek3 = k->ne[3]; + + //const int nev0 = v->ne[0]; + const int nev1 = v->ne[1]; + //const int nev2 = v->ne[2]; + //const int nev3 = v->ne[3]; + + const int ne0 = dst->ne[0]; + const int ne1 = dst->ne[1]; + //const int ne2 = dst->ne[2]; + //const int ne3 = dst->ne[3]; + + const int nbk0 = k->nb[0]; + const int nbk1 = k->nb[1]; + const int nbk2 = k->nb[2]; + const int nbk3 = k->nb[3]; + + const int nbq0 = q->nb[0]; + const int nbq1 = q->nb[1]; + const int nbq2 = q->nb[2]; + const int nbq3 = q->nb[3]; + + const int nbv0 = v->nb[0]; + const int nbv1 = v->nb[1]; + const int nbv2 = v->nb[2]; + const int nbv3 = v->nb[3]; + + const int nb0 = dst->nb[0]; + const int nb1 = dst->nb[1]; + const int nb2 = dst->nb[2]; + const int nb3 = dst->nb[3]; + + const int ith = params->ith; + const int nth = params->nth; + + const int D = neq0; + const int N = neq1; + const int P = nek1 - N; + const int M = P + N; + + GGML_ASSERT(ne0 == D); + GGML_ASSERT(ne1 == N); + GGML_ASSERT(P >= 0); + + GGML_ASSERT(nbq0 == sizeof(ggml_fp16_t)); + GGML_ASSERT(nbk0 == sizeof(ggml_fp16_t)); + GGML_ASSERT(nbv0 == sizeof(ggml_fp16_t)); + + GGML_ASSERT(neq0 == D); + GGML_ASSERT(nek0 == D); + GGML_ASSERT(nev1 == D); + + GGML_ASSERT(neq1 == N); + GGML_ASSERT(nek1 == N + P); + GGML_ASSERT(nev1 == D); + + // dst cannot be transposed or permuted + GGML_ASSERT(nb0 == sizeof(float)); + GGML_ASSERT(nb0 <= nb1); + GGML_ASSERT(nb1 <= nb2); + GGML_ASSERT(nb2 <= nb3); + + if (params->type == GGML_TASK_INIT) { + return; + } + + if (params->type == GGML_TASK_FINALIZE) { + return; + } + + // parallelize by q rows using ggml_vec_dot_f32 + + // total rows in q + const int nr = neq1*neq2*neq3; + + // rows per thread + const int dr = (nr + nth - 1)/nth; + + // row range for this thread + const int ir0 = dr*ith; + const int ir1 = MIN(ir0 + dr, nr); + + const float scale = 1.0/sqrt((double) D); + + //printf("P=%d N=%d D=%d ir0=%d ir1=%d scale = %f\n", P, N, D, ir0, ir1, scale); + + for (int ir = ir0; ir < ir1; ++ir) { + // q indices + const int iq3 = ir/(neq2*neq1); + const int iq2 = (ir - iq3*neq2*neq1)/neq1; + const int iq1 = (ir - iq3*neq2*neq1 - 
iq2*neq1); + + float * S = (float *) params->wdata + ith*(2*M + CACHE_LINE_SIZE_F32); + + for (int ic = 0; ic < nek1; ++ic) { + // k indices + const int ik3 = iq3; + const int ik2 = iq2; + const int ik1 = ic; + + // S indices + const int i1 = ik1; + + ggml_vec_dot_f16(neq0, + S + i1, + (ggml_fp16_t *) ((char *) k->data + (ik1*nbk1 + ik2*nbk2 + ik3*nbk3)), + (ggml_fp16_t *) ((char *) q->data + (iq1*nbq1 + iq2*nbq2 + iq3*nbq3))); + } + + // scale + ggml_vec_scale_f32(nek1, S, scale); + + if (masked) { + for (int i = P; i < M; i++) { + if (i > P + iq1) { + S[i] = -INFINITY; + } + } + } + + // softmax + { + float max = -INFINITY; + for (int i = 0; i < M; i++) { + max = MAX(max, S[i]); + } + + ggml_float sum = 0.0; + + uint16_t ss; + for (int i = 0; i < M; i++) { + if (S[i] == -INFINITY) { + S[i] = 0.0; + } else { + //const float val = (S[i] == -INFINITY) ? 0.0 : exp(S[i] - max); + ggml_fp16_t s = GGML_FP32_TO_FP16(S[i] - max); + memcpy(&ss, &s, sizeof(ss)); + const float val = GGML_FP16_TO_FP32(table_exp_f16[ss]); + sum += val; + S[i] = val; + } + } + + assert(sum > 0.0f); + + sum = 1.0/sum; + ggml_vec_scale_f32(M, S, sum); + } + + ggml_fp16_t * S16 = (ggml_fp16_t *) ((float *) params->wdata + ith*(2*M + CACHE_LINE_SIZE_F32) + M); + + for (int i = 0; i < M; i++) { + S16[i] = GGML_FP32_TO_FP16(S[i]); + } + + for (int ic = 0; ic < nev1; ++ic) { + // dst indices + const int i1 = iq1; + const int i2 = iq2; + const int i3 = iq3; + + ggml_vec_dot_f16(nek1, + (float *) ((char *) dst->data + (ic*nb0 + i1*nb1 + i2*nb2 + i3*nb3)), + (ggml_fp16_t *) ((char *) v->data + ( ic*nbv1 + i2*nbv2 + i3*nbv3)), + S16); + } + } +} + +void ggml_compute_forward_flash_attn( + const struct ggml_compute_params * params, + const struct ggml_tensor * q, + const struct ggml_tensor * k, + const struct ggml_tensor * v, + const bool masked, + struct ggml_tensor * dst) { + switch (q->type) { + case GGML_TYPE_F16: + { + ggml_compute_forward_flash_attn_f16(params, q, k, v, masked, dst); + } break; + case GGML_TYPE_F32: + { + ggml_compute_forward_flash_attn_f32(params, q, k, v, masked, dst); + } break; + case GGML_TYPE_I8: + case GGML_TYPE_I16: + case GGML_TYPE_I32: + case GGML_TYPE_COUNT: + { + assert(false); + } break; + } +} + +// ggml_compute_forward_flash_ff + +void ggml_compute_forward_flash_ff_f16( + const struct ggml_compute_params * params, + const struct ggml_tensor * a, // F16 + const struct ggml_tensor * b0, // F16 fc_w + const struct ggml_tensor * b1, // F32 fc_b + const struct ggml_tensor * c0, // F16 proj_w + const struct ggml_tensor * c1, // F32 proj_b + struct ggml_tensor * dst) { + int64_t t0 = ggml_perf_time_us(); + UNUSED(t0); + + const int nea0 = a->ne[0]; + const int nea1 = a->ne[1]; + const int nea2 = a->ne[2]; + const int nea3 = a->ne[3]; + + const int neb00 = b0->ne[0]; + const int neb01 = b0->ne[1]; + //const int neb02 = b0->ne[2]; + //const int neb03 = b0->ne[3]; + + const int neb10 = b1->ne[0]; + const int neb11 = b1->ne[1]; + //const int neb12 = b1->ne[2]; + //const int neb13 = b1->ne[3]; + + const int nec00 = c0->ne[0]; + const int nec01 = c0->ne[1]; + //const int nec02 = c0->ne[2]; + //const int nec03 = c0->ne[3]; + + const int nec10 = c1->ne[0]; + const int nec11 = c1->ne[1]; + //const int nec12 = c1->ne[2]; + //const int nec13 = c1->ne[3]; + + const int ne0 = dst->ne[0]; + const int ne1 = dst->ne[1]; + const int ne2 = dst->ne[2]; + //const int ne3 = dst->ne[3]; + + const int nba0 = a->nb[0]; + const int nba1 = a->nb[1]; + const int nba2 = a->nb[2]; + const int nba3 = a->nb[3]; + + const int nbb00 
= b0->nb[0]; + const int nbb01 = b0->nb[1]; + const int nbb02 = b0->nb[2]; + const int nbb03 = b0->nb[3]; + + const int nbb10 = b1->nb[0]; + //const int nbb11 = b1->nb[1]; + //const int nbb12 = b1->nb[2]; + //const int nbb13 = b1->nb[3]; + + const int nbc00 = c0->nb[0]; + const int nbc01 = c0->nb[1]; + const int nbc02 = c0->nb[2]; + const int nbc03 = c0->nb[3]; + + const int nbc10 = c1->nb[0]; + //const int nbc11 = c1->nb[1]; + //const int nbc12 = c1->nb[2]; + //const int nbc13 = c1->nb[3]; + + const int nb0 = dst->nb[0]; + const int nb1 = dst->nb[1]; + const int nb2 = dst->nb[2]; + const int nb3 = dst->nb[3]; + + const int ith = params->ith; + const int nth = params->nth; + + const int D = nea0; + //const int N = nea1; + const int M = neb01; + + GGML_ASSERT(ne0 == nea0); + GGML_ASSERT(ne1 == nea1); + GGML_ASSERT(ne2 == nea2); + + GGML_ASSERT(nba0 == sizeof(ggml_fp16_t)); + GGML_ASSERT(nbb00 == sizeof(ggml_fp16_t)); + GGML_ASSERT(nbb10 == sizeof(float)); + GGML_ASSERT(nbc00 == sizeof(ggml_fp16_t)); + GGML_ASSERT(nbc10 == sizeof(float)); + + GGML_ASSERT(neb00 == D); + GGML_ASSERT(neb01 == M); + GGML_ASSERT(neb10 == M); + GGML_ASSERT(neb11 == 1); + + GGML_ASSERT(nec00 == M); + GGML_ASSERT(nec01 == D); + GGML_ASSERT(nec10 == D); + GGML_ASSERT(nec11 == 1); + + // dst cannot be transposed or permuted + GGML_ASSERT(nb0 == sizeof(float)); + GGML_ASSERT(nb0 <= nb1); + GGML_ASSERT(nb1 <= nb2); + GGML_ASSERT(nb2 <= nb3); + + if (params->type == GGML_TASK_INIT) { + return; + } + + if (params->type == GGML_TASK_FINALIZE) { + return; + } + + // parallelize by a rows using ggml_vec_dot_f32 + + // total rows in a + const int nr = nea1*nea2*nea3; + + // rows per thread + const int dr = (nr + nth - 1)/nth; + + // row range for this thread + const int ir0 = dr*ith; + const int ir1 = MIN(ir0 + dr, nr); + + for (int ir = ir0; ir < ir1; ++ir) { + // a indices + const int ia3 = ir/(nea2*nea1); + const int ia2 = (ir - ia3*nea2*nea1)/nea1; + const int ia1 = (ir - ia3*nea2*nea1 - ia2*nea1); + + float * S = (float *) params->wdata + ith*(2*M + CACHE_LINE_SIZE_F32); + + for (int ic = 0; ic < neb01; ++ic) { + // b0 indices + const int ib03 = ia3; + const int ib02 = ia2; + const int ib01 = ic; + + // S indices + const int i1 = ib01; + + ggml_vec_dot_f16(nea0, + S + i1, + (ggml_fp16_t *) ((char *) b0->data + (ib01*nbb01 + ib02*nbb02 + ib03*nbb03)), + (ggml_fp16_t *) ((char *) a->data + ( ia1*nba1 + ia2*nba2 + ia3*nba3))); + } + + ggml_vec_add_f32(neb01, S, S, (float *) b1->data); + //ggml_vec_gelu_f32(neb01, S, S); + + ggml_fp16_t * S16 = (ggml_fp16_t *) ((float *) params->wdata + ith*(2*M + CACHE_LINE_SIZE_F32) + M); + + for (int i = 0; i < M; i++) { + S16[i] = GGML_FP32_TO_FP16(S[i]); + } + + ggml_vec_gelu_f16(neb01, S16, S16); + + { + // dst indices + const int i1 = ia1; + const int i2 = ia2; + const int i3 = ia3; + + for (int ic = 0; ic < nec01; ++ic) { + + ggml_vec_dot_f16(neb01, + (float *) ((char *) dst->data + (ic*nb0 + i1*nb1 + i2*nb2 + i3*nb3)), + (ggml_fp16_t *) ((char *) c0->data + ( ic*nbc01 + i2*nbc02 + i3*nbc03)), + S16); + } + + ggml_vec_add_f32(nec01, + (float *) ((char *) dst->data + (i1*nb1 + i2*nb2 + i3*nb3)), + (float *) ((char *) dst->data + (i1*nb1 + i2*nb2 + i3*nb3)), + (float *) c1->data); + } + } +} + +void ggml_compute_forward_flash_ff( + const struct ggml_compute_params * params, + const struct ggml_tensor * a, + const struct ggml_tensor * b0, + const struct ggml_tensor * b1, + const struct ggml_tensor * c0, + const struct ggml_tensor * c1, + struct ggml_tensor * dst) { + switch (b0->type) { 
+ case GGML_TYPE_F16: + { + ggml_compute_forward_flash_ff_f16(params, a, b0, b1, c0, c1, dst); + } break; + case GGML_TYPE_F32: + { + GGML_ASSERT(false); // TODO + } break; + case GGML_TYPE_I8: + case GGML_TYPE_I16: + case GGML_TYPE_I32: + case GGML_TYPE_COUNT: + { + assert(false); + } break; + } +} + +///////////////////////////////// + +void ggml_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor) { + assert(params); + + switch (tensor->op) { + case GGML_OP_DUP: + { + ggml_compute_forward_dup(params, tensor->src0, tensor); + } break; + case GGML_OP_ADD: + { + ggml_compute_forward_add(params, tensor->src0, tensor->src1, tensor); + } break; + case GGML_OP_SUB: + { + ggml_compute_forward_sub(params, tensor->src0, tensor->src1, tensor); + } break; + case GGML_OP_MUL: + { + ggml_compute_forward_mul(params, tensor->src0, tensor->src1, tensor); + } break; + case GGML_OP_DIV: + { + ggml_compute_forward_div(params, tensor->src0, tensor->src1, tensor); + } break; + case GGML_OP_SQR: + { + ggml_compute_forward_sqr(params, tensor->src0, tensor); + } break; + case GGML_OP_SQRT: + { + ggml_compute_forward_sqrt(params, tensor->src0, tensor); + } break; + case GGML_OP_SUM: + { + ggml_compute_forward_sum(params, tensor->src0, tensor); + } break; + case GGML_OP_MEAN: + { + ggml_compute_forward_mean(params, tensor->src0, tensor); + } break; + case GGML_OP_REPEAT: + { + ggml_compute_forward_repeat(params, tensor->src0, tensor); + } break; + case GGML_OP_ABS: + { + ggml_compute_forward_abs(params, tensor->src0, tensor); + } break; + case GGML_OP_SGN: + { + ggml_compute_forward_sgn(params, tensor->src0, tensor); + } break; + case GGML_OP_NEG: + { + ggml_compute_forward_neg(params, tensor->src0, tensor); + } break; + case GGML_OP_STEP: + { + ggml_compute_forward_step(params, tensor->src0, tensor); + } break; + case GGML_OP_RELU: + { + ggml_compute_forward_relu(params, tensor->src0, tensor); + } break; + case GGML_OP_GELU: + { + ggml_compute_forward_gelu(params, tensor->src0, tensor); + } break; + case GGML_OP_NORM: + { + ggml_compute_forward_norm(params, tensor->src0, tensor); + } break; + case GGML_OP_MUL_MAT: + { + ggml_compute_forward_mul_mat(params, tensor->src0, tensor->src1, tensor); + } break; + case GGML_OP_SCALE: + { + ggml_compute_forward_scale(params, tensor->src0, tensor->src1, tensor); + } break; + case GGML_OP_CPY: + { + ggml_compute_forward_cpy(params, tensor->src0, tensor); + } break; + case GGML_OP_RESHAPE: + { + ggml_compute_forward_reshape(params, tensor->src0, tensor); + } break; + case GGML_OP_VIEW: + { + ggml_compute_forward_view(params, tensor->src0); + } break; + case GGML_OP_PERMUTE: + { + ggml_compute_forward_permute(params, tensor->src0); + } break; + case GGML_OP_TRANSPOSE: + { + ggml_compute_forward_transpose(params, tensor->src0); + } break; + case GGML_OP_GET_ROWS: + { + ggml_compute_forward_get_rows(params, tensor->src0, tensor->src1, tensor); + } break; + case GGML_OP_DIAG_MASK_INF: + { + ggml_compute_forward_diag_mask_inf(params, tensor->src0, tensor->src1, tensor); + } break; + case GGML_OP_SOFT_MAX: + { + ggml_compute_forward_soft_max(params, tensor->src0, tensor); + } break; + case GGML_OP_ROPE: + { + ggml_compute_forward_rope(params, tensor->src0, tensor->src1, tensor); + } break; + case GGML_OP_CONV_1D_1S: + { + ggml_compute_forward_conv_1d_1s(params, tensor->src0, tensor->src1, tensor); + } break; + case GGML_OP_CONV_1D_2S: + { + ggml_compute_forward_conv_1d_2s(params, tensor->src0, tensor->src1, tensor); + } break; + case 
GGML_OP_FLASH_ATTN: + { + int32_t t = ggml_get_i32_1d(tensor->opt[1], 0); + GGML_ASSERT(t == 0 || t == 1); + bool masked = t != 0; + ggml_compute_forward_flash_attn(params, tensor->src0, tensor->src1, tensor->opt[0], masked, tensor); + } break; + case GGML_OP_FLASH_FF: + { + ggml_compute_forward_flash_ff(params, tensor->src0, tensor->src1, tensor->opt[0], tensor->opt[1], tensor->opt[2], tensor); + } break; + case GGML_OP_NONE: + { + // nop + } break; + case GGML_OP_COUNT: + { + GGML_ASSERT(false); + } break; + }; +} + +//////////////////////////////////////////////////////////////////////////////// + +void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor * tensor, bool inplace) { + struct ggml_tensor * src0 = tensor->src0; + struct ggml_tensor * src1 = tensor->src1; + + switch (tensor->op) { + case GGML_OP_DUP: + { + if (src0->grad) { + src0->grad = ggml_add_impl(ctx, src0->grad, tensor->grad, inplace); + } + } break; + case GGML_OP_ADD: + { + if (src0->grad) { + src0->grad = ggml_add_impl(ctx, src0->grad, tensor->grad, inplace); + } + if (src1->grad) { + src1->grad = ggml_add_impl(ctx, src1->grad, tensor->grad, inplace); + } + } break; + case GGML_OP_SUB: + { + if (src0->grad) { + src0->grad = ggml_add_impl(ctx, src0->grad, tensor->grad, inplace); + } + if (src1->grad) { + src1->grad = ggml_sub_impl(ctx, src1->grad, tensor->grad, inplace); + } + } break; + case GGML_OP_MUL: + { + if (src0->grad) { + src0->grad = + ggml_add_impl(ctx, + src0->grad, + ggml_mul(ctx, src1, tensor->grad), + inplace); + } + if (src1->grad) { + src1->grad = + ggml_add_impl(ctx, + src1->grad, + ggml_mul(ctx, src0, tensor->grad), + inplace); + } + } break; + case GGML_OP_DIV: + { + if (src0->grad) { + src0->grad = + ggml_add_impl(ctx, + src0->grad, + ggml_div(ctx, tensor->grad, src1), + inplace); + } + if (src1->grad) { + src1->grad = + ggml_sub_impl(ctx, + src1->grad, + ggml_mul(ctx, + tensor->grad, + ggml_div(ctx, tensor, src1)), + inplace); + } + } break; + case GGML_OP_SQR: + { + if (src0->grad) { + src0->grad = + ggml_add_impl(ctx, + src0->grad, + ggml_mul(ctx, + ggml_mul(ctx, src0, tensor->grad), + ggml_repeat(ctx, ggml_new_f32(ctx, 2.0f), src0)), + inplace); + } + } break; + case GGML_OP_SQRT: + { + if (src0->grad) { + src0->grad = + ggml_add_impl(ctx, + src0->grad, + ggml_div(ctx, + ggml_repeat(ctx, ggml_new_f32(ctx, 0.5f), tensor), + tensor), + inplace); + } + } break; + case GGML_OP_SUM: + { + if (src0->grad) { + src0->grad = + ggml_add_impl(ctx, + src0->grad, + ggml_repeat(ctx, tensor->grad, src0->grad), + inplace); + } + } break; + case GGML_OP_MEAN: + { + assert(false); // TODO: implement + } break; + case GGML_OP_REPEAT: + { + if (src0->grad) { + src0->grad = + ggml_add_impl(ctx, + src0->grad, + ggml_sum(ctx, tensor->grad), + inplace); + } + } break; + case GGML_OP_ABS: + { + if (src0->grad) { + src0->grad = + ggml_add_impl(ctx, + src0->grad, + ggml_mul(ctx, + ggml_sgn(ctx, src0), + tensor->grad), + inplace); + } + } break; + case GGML_OP_SGN: + { + if (src0->grad) { + // noop + } + } break; + case GGML_OP_NEG: + { + if (src0->grad) { + src0->grad = ggml_sub_impl(ctx, src0->grad, tensor->grad, inplace); + } + } break; + case GGML_OP_STEP: + { + if (src0->grad) { + // noop + } + } break; + case GGML_OP_RELU: + { + if (src0->grad) { + src0->grad = ggml_sub_impl(ctx, + src0->grad, + ggml_mul(ctx, + ggml_step(ctx, src0), + tensor->grad), + inplace); + } + } break; + case GGML_OP_GELU: + { + assert(false); // TODO: not implemented + } break; + case GGML_OP_NORM: + { + assert(false); // 
TODO: not implemented + } break; + case GGML_OP_MUL_MAT: + { + if (src0->grad) { + // TODO: this requires outer product - ggml_out_prod(ctx, src1, tensor->grad); + assert(false); + } + if (src1->grad) { + src1->grad = + ggml_add_impl(ctx, + src1->grad, + // TODO: fix transpose, the node will break the graph connections + ggml_mul_mat(ctx, ggml_transpose(ctx, src0), tensor->grad), + inplace); + } + } break; + case GGML_OP_SCALE: + { + GGML_ASSERT(false); // TODO: not implemented + } break; + case GGML_OP_CPY: + { + GGML_ASSERT(false); // TODO: not implemented + } break; + case GGML_OP_RESHAPE: + { + GGML_ASSERT(false); // TODO: not implemented + } break; + case GGML_OP_VIEW: + { + GGML_ASSERT(false); // not supported + } break; + case GGML_OP_PERMUTE: + { + GGML_ASSERT(false); // TODO: not implemented + } break; + case GGML_OP_TRANSPOSE: + { + GGML_ASSERT(false); // TODO: not implemented + } break; + case GGML_OP_GET_ROWS: + { + GGML_ASSERT(false); // TODO: not implemented + } break; + case GGML_OP_DIAG_MASK_INF: + { + GGML_ASSERT(false); // TODO: not implemented + } break; + case GGML_OP_SOFT_MAX: + { + GGML_ASSERT(false); // TODO: not implemented + } break; + case GGML_OP_ROPE: + { + GGML_ASSERT(false); // TODO: not implemented + } break; + case GGML_OP_CONV_1D_1S: + { + GGML_ASSERT(false); // TODO: not implemented + } break; + case GGML_OP_CONV_1D_2S: + { + GGML_ASSERT(false); // TODO: not implemented + } break; + case GGML_OP_FLASH_ATTN: + { + GGML_ASSERT(false); // not supported + } break; + case GGML_OP_FLASH_FF: + { + GGML_ASSERT(false); // not supported + } break; + case GGML_OP_NONE: + { + // nop + } break; + case GGML_OP_COUNT: + { + GGML_ASSERT(false); + } break; + }; +} + +void ggml_visit_parents(struct ggml_cgraph * cgraph, struct ggml_tensor * node) { + if (node->grad == NULL) { + // this usually happens when we generate intermediate nodes from constants in the backward pass + // it can also happen during forward pass, if the user performs computations with constants + if (node->op != GGML_OP_NONE) { + //GGML_PRINT_DEBUG("%s: warning: node %p has no grad, but op %d\n", __func__, (void *) node, node->op); + } + } + + // check if already visited + for (int i = 0; i < cgraph->n_nodes; i++) { + if (cgraph->nodes[i] == node) { + return; + } + } + + for (int i = 0; i < cgraph->n_leafs; i++) { + if (cgraph->leafs[i] == node) { + return; + } + } + + if (node->src0) { + ggml_visit_parents(cgraph, node->src0); + } + + if (node->src1) { + ggml_visit_parents(cgraph, node->src1); + } + + for (int i = 0; i < GGML_MAX_OPT; ++i) { + if (node->opt[i]) { + ggml_visit_parents(cgraph, node->opt[i]); + } + } + + if (node->op == GGML_OP_NONE && node->grad == NULL) { + // reached a leaf node, not part of the gradient graph (e.g. 
a constant) + assert(cgraph->n_leafs < GGML_MAX_NODES); + + cgraph->leafs[cgraph->n_leafs] = node; + cgraph->n_leafs++; + } else { + assert(cgraph->n_nodes < GGML_MAX_NODES); + + cgraph->nodes[cgraph->n_nodes] = node; + cgraph->grads[cgraph->n_nodes] = node->grad; + cgraph->n_nodes++; + } +} + +void ggml_build_forward_impl(struct ggml_cgraph * cgraph, struct ggml_tensor * tensor, bool expand) { + if (!expand) { + cgraph->n_nodes = 0; + cgraph->n_leafs = 0; + } + + const int n0 = cgraph->n_nodes; + UNUSED(n0); + + ggml_visit_parents(cgraph, tensor); + + const int n_new = cgraph->n_nodes - n0; + GGML_PRINT_DEBUG("%s: visited %d new nodes\n", __func__, n_new); + + if (n_new > 0) { + // the last added node should always be starting point + assert(cgraph->nodes[cgraph->n_nodes - 1] == tensor); + } +} + +void ggml_build_forward_expand(struct ggml_cgraph * cgraph, struct ggml_tensor * tensor) { + ggml_build_forward_impl(cgraph, tensor, true); +} + +struct ggml_cgraph ggml_build_forward(struct ggml_tensor * tensor) { + struct ggml_cgraph result = { + /*.n_nodes =*/ 0, + /*.n_leafs =*/ 0, + /*.n_threads =*/ 0, + /*.work_size =*/ 0, + /*.work =*/ NULL, + /*.nodes =*/ { NULL }, + /*.grads =*/ { NULL }, + /*.leafs =*/ { NULL }, + /*.perf_runs =*/ 0, + /*.perf_cycles =*/ 0, + /*.perf_time_us =*/ 0, + }; + + ggml_build_forward_impl(&result, tensor, false); + + return result; +} + +struct ggml_cgraph ggml_build_backward(struct ggml_context * ctx, struct ggml_cgraph * gf, bool keep) { + struct ggml_cgraph result = *gf; + + assert(gf->n_nodes > 0); + + // if we are keeping the gradient graph, we have to detach the gradient nodes from the original graph + if (keep) { + for (int i = 0; i < gf->n_nodes; i++) { + struct ggml_tensor * node = gf->nodes[i]; + + if (node->grad) { + node->grad = ggml_dup_tensor(ctx, node); + gf->grads[i] = node->grad; + } + } + } + + for (int i = gf->n_nodes - 1; i >= 0; i--) { + struct ggml_tensor * node = gf->nodes[i]; + + // because we detached the grad nodes from the original graph, we can afford inplace operations + if (node->grad) { + ggml_compute_backward(ctx, node, keep); + } + } + + for (int i = gf->n_nodes - 1; i >= 0; i--) { + struct ggml_tensor * node = gf->nodes[i]; + + if (node->is_param) { + GGML_PRINT_DEBUG("%s: found root node %p\n", __func__, (void *) node); + ggml_build_forward_impl(&result, node->grad, true); + } + } + + return result; +} + +// +// thread data +// +// synchronization is done via busy loops +// I tried using spin locks, but not sure how to use them correctly - the things I tried were slower than busy loops +// + +#ifdef __APPLE__ + +//#include + +//typedef os_unfair_lock ggml_lock_t; +// +//#define ggml_lock_init(x) UNUSED(x) +//#define ggml_lock_destroy(x) UNUSED(x) +//#define ggml_lock_lock os_unfair_lock_lock +//#define ggml_lock_unlock os_unfair_lock_unlock +// +//#define GGML_LOCK_INITIALIZER OS_UNFAIR_LOCK_INIT + +typedef int ggml_lock_t; + +#define ggml_lock_init(x) UNUSED(x) +#define ggml_lock_destroy(x) UNUSED(x) +#define ggml_lock_lock(x) UNUSED(x) +#define ggml_lock_unlock(x) UNUSED(x) + +#define GGML_LOCK_INITIALIZER 0 + +#else + +//typedef pthread_spinlock_t ggml_lock_t; + +//#define ggml_lock_init(x) pthread_spin_init(x, PTHREAD_PROCESS_PRIVATE) +//#define ggml_lock_destroy pthread_spin_destroy +//#define ggml_lock_lock pthread_spin_lock +//#define ggml_lock_unlock pthread_spin_unlock + +typedef int ggml_lock_t; + +#define ggml_lock_init(x) UNUSED(x) +#define ggml_lock_destroy(x) UNUSED(x) +#define ggml_lock_lock(x) UNUSED(x) 
+#define ggml_lock_unlock(x) UNUSED(x) + +#define GGML_LOCK_INITIALIZER 0 + +#endif + +struct ggml_compute_state_shared { + ggml_lock_t spin; + + int n_threads; + + // synchronization primitives + atomic_int n_ready; + atomic_bool has_work; + atomic_bool stop; // stop all threads +}; + +struct ggml_compute_state { + pthread_t thrd; + + struct ggml_compute_params params; + struct ggml_tensor * node; + + struct ggml_compute_state_shared * shared; +}; + +// function used by each compute thread +void * ggml_graph_compute_one(void * data) { + struct ggml_compute_state * state = (struct ggml_compute_state *) data; + + ggml_compute_forward(&state->params, state->node); + + return NULL; +} + +thread_ret_t ggml_graph_compute_thread(void * data) { + struct ggml_compute_state * state = (struct ggml_compute_state *) data; + + const int n_threads = state->shared->n_threads; + + while (true) { + if (atomic_fetch_add(&state->shared->n_ready, 1) == n_threads - 1) { + atomic_store(&state->shared->has_work, false); + } else { + while (atomic_load(&state->shared->has_work)) { + if (atomic_load(&state->shared->stop)) { + return 0; + } + ggml_lock_lock (&state->shared->spin); + ggml_lock_unlock(&state->shared->spin); + } + } + + atomic_fetch_sub(&state->shared->n_ready, 1); + + // wait for work + while (!atomic_load(&state->shared->has_work)) { + if (atomic_load(&state->shared->stop)) { + return 0; + } + ggml_lock_lock (&state->shared->spin); + ggml_lock_unlock(&state->shared->spin); + } + + // check if we should stop + if (atomic_load(&state->shared->stop)) { + break; + } + + if (state->node) { + ggml_compute_forward(&state->params, state->node); + state->node = NULL; + } else { + break; + } + } + + return 0; +} + +void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph) { + if (cgraph->n_threads <= 0) { + cgraph->n_threads = 8; + } + + const int n_threads = cgraph->n_threads; + + struct ggml_compute_state_shared state_shared = { + /*.spin =*/ GGML_LOCK_INITIALIZER, + /*.n_threads =*/ n_threads, + /*.n_ready =*/ 0, + /*.has_work =*/ false, + /*.stop =*/ false, + }; + struct ggml_compute_state * workers = n_threads > 1 ? alloca(sizeof(struct ggml_compute_state)*(n_threads - 1)) : NULL; + + // create thread pool + if (n_threads > 1) { + ggml_lock_init(&state_shared.spin); + + atomic_store(&state_shared.has_work, true); + + for (int j = 0; j < n_threads - 1; j++) { + workers[j] = (struct ggml_compute_state) { + .thrd = 0, + .params = { + .type = GGML_TASK_COMPUTE, + .ith = j + 1, + .nth = n_threads, + .wsize = cgraph->work ? ggml_nbytes(cgraph->work) : 0, + .wdata = cgraph->work ? 
cgraph->work->data : NULL, + }, + .node = NULL, + .shared = &state_shared, + }; + int rc = pthread_create(&workers[j].thrd, NULL, ggml_graph_compute_thread, &workers[j]); + assert(rc == 0); + UNUSED(rc); + } + } + + // initialize tasks + work buffer + { + size_t work_size = 0; + + // thread scheduling for the different operations + for (int i = 0; i < cgraph->n_nodes; i++) { + struct ggml_tensor * node = cgraph->nodes[i]; + + switch (node->op) { + case GGML_OP_DUP: + { + node->n_tasks = 1; + } break; + case GGML_OP_ADD: + { + node->n_tasks = n_threads; + } break; + case GGML_OP_SUB: + case GGML_OP_MUL: + case GGML_OP_DIV: + case GGML_OP_SQR: + case GGML_OP_SQRT: + case GGML_OP_SUM: + case GGML_OP_MEAN: + case GGML_OP_REPEAT: + case GGML_OP_ABS: + case GGML_OP_SGN: + case GGML_OP_NEG: + case GGML_OP_STEP: + case GGML_OP_RELU: + { + node->n_tasks = 1; + } break; + case GGML_OP_GELU: + { + node->n_tasks = n_threads; + } break; + case GGML_OP_NORM: + { + node->n_tasks = n_threads; + } break; + case GGML_OP_MUL_MAT: + { + // TODO: use different scheduling for different matrix sizes + node->n_tasks = n_threads; + + size_t cur = 0; + + // TODO: better way to determine if the matrix is transposed + if (node->src0->nb[1] < node->src0->nb[0]) { + cur = ggml_nbytes(node)*node->n_tasks; // TODO: this can become (n_tasks-1) + } else { + if (node->src0->type == GGML_TYPE_F16 && + node->src1->type == GGML_TYPE_F32) { +#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) + if (ggml_compute_forward_mul_mat_use_blas(node->src0, node->src1, node)) { + cur = sizeof(float)*(node->src0->ne[0]*node->src0->ne[1]); + } else { + cur = sizeof(ggml_fp16_t)*ggml_nelements(node->src1); + } +#else + cur = sizeof(ggml_fp16_t)*ggml_nelements(node->src1); +#endif + } else if (node->src0->type == GGML_TYPE_F32 && + node->src1->type == GGML_TYPE_F32) { + cur = 0; + } else { + GGML_ASSERT(false); + } + } + + work_size = MAX(work_size, cur); + } break; + case GGML_OP_SCALE: + { + node->n_tasks = n_threads; + } break; + case GGML_OP_CPY: + case GGML_OP_RESHAPE: + case GGML_OP_VIEW: + case GGML_OP_PERMUTE: + case GGML_OP_TRANSPOSE: + case GGML_OP_GET_ROWS: + case GGML_OP_DIAG_MASK_INF: + { + node->n_tasks = 1; + } break; + case GGML_OP_SOFT_MAX: + { + node->n_tasks = n_threads; + } break; + case GGML_OP_ROPE: + { + node->n_tasks = 1; + } break; + case GGML_OP_CONV_1D_1S: + case GGML_OP_CONV_1D_2S: + { + node->n_tasks = n_threads; + + GGML_ASSERT(node->src0->ne[3] == 1); + GGML_ASSERT(node->src1->ne[2] == 1); + GGML_ASSERT(node->src1->ne[3] == 1); + + size_t cur = 0; + const int nk = node->src0->ne[0]; + + if (node->src0->type == GGML_TYPE_F16 && + node->src1->type == GGML_TYPE_F32) { + cur = sizeof(ggml_fp16_t)*( + nk*ggml_up32(node->src0->ne[1])*node->src0->ne[2] + + ( 2*(nk/2) + node->src1->ne[0])*node->src1->ne[1] + ); + } else if (node->src0->type == GGML_TYPE_F32 && + node->src1->type == GGML_TYPE_F32) { + cur = sizeof(float)*( + nk*ggml_up32(node->src0->ne[1])*node->src0->ne[2] + + ( 2*(nk/2) + node->src1->ne[0])*node->src1->ne[1] + ); + } else { + GGML_ASSERT(false); + } + + work_size = MAX(work_size, cur); + } break; + case GGML_OP_FLASH_ATTN: + { + node->n_tasks = n_threads; + + size_t cur = 0; + + if (node->src1->type == GGML_TYPE_F32) { + cur = sizeof(float)*node->src1->ne[1]*node->n_tasks; // TODO: this can become (n_tasks-1) + cur += sizeof(float)*node->src1->ne[1]*node->n_tasks; // this is overestimated by x2 + } + + if (node->src1->type == GGML_TYPE_F16) { + cur = 
sizeof(float)*node->src1->ne[1]*node->n_tasks; // TODO: this can become (n_tasks-1) + cur += sizeof(float)*node->src1->ne[1]*node->n_tasks; // this is overestimated by x2 + } + + work_size = MAX(work_size, cur); + } break; + case GGML_OP_FLASH_FF: + { + node->n_tasks = n_threads; + + size_t cur = 0; + + if (node->src1->type == GGML_TYPE_F32) { + cur = sizeof(float)*node->src1->ne[1]*node->n_tasks; // TODO: this can become (n_tasks-1) + cur += sizeof(float)*node->src1->ne[1]*node->n_tasks; // this is overestimated by x2 + } + + if (node->src1->type == GGML_TYPE_F16) { + cur = sizeof(float)*node->src1->ne[1]*node->n_tasks; // TODO: this can become (n_tasks-1) + cur += sizeof(float)*node->src1->ne[1]*node->n_tasks; // this is overestimated by x2 + } + + work_size = MAX(work_size, cur); + } break; + case GGML_OP_NONE: + { + node->n_tasks = 1; + } break; + case GGML_OP_COUNT: + { + assert(false); + } break; + }; + } + + if (cgraph->work != NULL && work_size > cgraph->work_size) { + assert(false); // TODO: better handling + } + + if (work_size > 0 && cgraph->work == NULL) { + cgraph->work_size = work_size + CACHE_LINE_SIZE*(n_threads - 1); + + GGML_PRINT_DEBUG("%s: allocating work buffer for graph (%zu bytes)\n", __func__, cgraph->work_size); + cgraph->work = ggml_new_tensor_1d(ctx, GGML_TYPE_I8, cgraph->work_size); + } + } + + const int64_t perf_start_cycles = ggml_perf_cycles(); + const int64_t perf_start_time_us = ggml_perf_time_us(); + + for (int i = 0; i < cgraph->n_nodes; i++) { + GGML_PRINT_DEBUG_5("%s: %d/%d\n", __func__, i, cgraph->n_nodes); + + struct ggml_tensor * node = cgraph->nodes[i]; + + // TODO: this could be used to avoid unnecessary computations, but it needs to be improved + //if (node->grad == NULL && node->perf_runs > 0) { + // continue; + //} + + const int64_t perf_node_start_cycles = ggml_perf_cycles(); + const int64_t perf_node_start_time_us = ggml_perf_time_us(); + + // INIT + struct ggml_compute_params params = { + /*.type =*/ GGML_TASK_INIT, + /*.ith =*/ 0, + /*.nth =*/ node->n_tasks, + /*.wsize =*/ cgraph->work ? ggml_nbytes(cgraph->work) : 0, + /*.wdata =*/ cgraph->work ? cgraph->work->data : NULL, + }; + + ggml_compute_forward(¶ms, node); + + // COMPUTE + if (node->n_tasks > 1) { + if (atomic_fetch_add(&state_shared.n_ready, 1) == n_threads - 1) { + atomic_store(&state_shared.has_work, false); + } + + while (atomic_load(&state_shared.has_work)) { + ggml_lock_lock (&state_shared.spin); + ggml_lock_unlock(&state_shared.spin); + } + + // launch thread pool + for (int j = 0; j < n_threads - 1; j++) { + workers[j].params = (struct ggml_compute_params) { + .type = GGML_TASK_COMPUTE, + .ith = j + 1, + .nth = n_threads, + .wsize = cgraph->work ? ggml_nbytes(cgraph->work) : 0, + .wdata = cgraph->work ? 
cgraph->work->data : NULL, + }; + workers[j].node = node; + } + + atomic_fetch_sub(&state_shared.n_ready, 1); + + while (atomic_load(&state_shared.n_ready) > 0) { + ggml_lock_lock (&state_shared.spin); + ggml_lock_unlock(&state_shared.spin); + } + + atomic_store(&state_shared.has_work, true); + } + + params.type = GGML_TASK_COMPUTE; + ggml_compute_forward(¶ms, node); + + // wait for thread pool + if (node->n_tasks > 1) { + if (atomic_fetch_add(&state_shared.n_ready, 1) == n_threads - 1) { + atomic_store(&state_shared.has_work, false); + } + + while (atomic_load(&state_shared.has_work)) { + ggml_lock_lock (&state_shared.spin); + ggml_lock_unlock(&state_shared.spin); + } + + atomic_fetch_sub(&state_shared.n_ready, 1); + + while (atomic_load(&state_shared.n_ready) != 0) { + ggml_lock_lock (&state_shared.spin); + ggml_lock_unlock(&state_shared.spin); + } + } + + // FINALIZE + if (node->n_tasks > 1) { + if (atomic_fetch_add(&state_shared.n_ready, 1) == n_threads - 1) { + atomic_store(&state_shared.has_work, false); + } + + while (atomic_load(&state_shared.has_work)) { + ggml_lock_lock (&state_shared.spin); + ggml_lock_unlock(&state_shared.spin); + } + + // launch thread pool + for (int j = 0; j < n_threads - 1; j++) { + workers[j].params = (struct ggml_compute_params) { + .type = GGML_TASK_FINALIZE, + .ith = j + 1, + .nth = n_threads, + .wsize = cgraph->work ? ggml_nbytes(cgraph->work) : 0, + .wdata = cgraph->work ? cgraph->work->data : NULL, + }; + workers[j].node = node; + } + + atomic_fetch_sub(&state_shared.n_ready, 1); + + while (atomic_load(&state_shared.n_ready) > 0) { + ggml_lock_lock (&state_shared.spin); + ggml_lock_unlock(&state_shared.spin); + } + + atomic_store(&state_shared.has_work, true); + } + + params.type = GGML_TASK_FINALIZE; + ggml_compute_forward(¶ms, node); + + // wait for thread pool + if (node->n_tasks > 1) { + if (atomic_fetch_add(&state_shared.n_ready, 1) == n_threads - 1) { + atomic_store(&state_shared.has_work, false); + } + + while (atomic_load(&state_shared.has_work)) { + ggml_lock_lock (&state_shared.spin); + ggml_lock_unlock(&state_shared.spin); + } + + atomic_fetch_sub(&state_shared.n_ready, 1); + + while (atomic_load(&state_shared.n_ready) != 0) { + ggml_lock_lock (&state_shared.spin); + ggml_lock_unlock(&state_shared.spin); + } + } + + // performance stats (node) + { + int64_t perf_cycles_cur = ggml_perf_cycles() - perf_node_start_cycles; + int64_t perf_time_us_cur = ggml_perf_time_us() - perf_node_start_time_us; + + node->perf_runs++; + node->perf_cycles += perf_cycles_cur; + node->perf_time_us += perf_time_us_cur; + } + } + + // join thread pool + if (n_threads > 1) { + atomic_store(&state_shared.stop, true); + atomic_store(&state_shared.has_work, true); + + for (int j = 0; j < n_threads - 1; j++) { + int rc = pthread_join(workers[j].thrd, NULL); + assert(rc == 0); + UNUSED(rc); + } + + ggml_lock_destroy(&state_shared.spin); + } + + // performance stats (graph) + { + int64_t perf_cycles_cur = ggml_perf_cycles() - perf_start_cycles; + int64_t perf_time_us_cur = ggml_perf_time_us() - perf_start_time_us; + + cgraph->perf_runs++; + cgraph->perf_cycles += perf_cycles_cur; + cgraph->perf_time_us += perf_time_us_cur; + + GGML_PRINT_DEBUG("%s: perf (%d) - cpu = %.3f / %.3f ms, wall = %.3f / %.3f ms\n", + __func__, cgraph->perf_runs, + (double) perf_cycles_cur / (double) ggml_cycles_per_ms(), + (double) cgraph->perf_cycles / (double) ggml_cycles_per_ms() / (double) cgraph->perf_runs, + (double) perf_time_us_cur / 1000.0, + (double) cgraph->perf_time_us / 1000.0 / 
cgraph->perf_runs); + } +} + +void ggml_graph_reset(struct ggml_cgraph * cgraph) { + for (int i = 0; i < cgraph->n_nodes; i++) { + struct ggml_tensor * grad = cgraph->grads[i]; + + if (grad) { + ggml_set_zero(grad); + } + } +} + +void ggml_graph_print(const struct ggml_cgraph * cgraph) { + int64_t perf_total_per_op_us[GGML_OP_COUNT] = {0}; + + GGML_PRINT("=== GRAPH ===\n"); + + GGML_PRINT_DEBUG("n_threads = %d\n", cgraph->n_threads); + GGML_PRINT_DEBUG("total work size = %zu bytes\n",cgraph->work_size); + + GGML_PRINT("n_nodes = %d\n", cgraph->n_nodes); + for (int i = 0; i < cgraph->n_nodes; i++) { + struct ggml_tensor * node = cgraph->nodes[i]; + + perf_total_per_op_us[node->op] += node->perf_time_us; + + GGML_PRINT(" - %3d: [ %6d, %6d, %6d] %16s %s (%3d) cpu = %7.3f / %7.3f ms, wall = %7.3f / %7.3f ms\n", + i, + node->ne[0], node->ne[1], node->ne[2], + GGML_OP_LABEL[node->op], node->is_param ? "x" : node->grad ? "g" : " ", node->perf_runs, + (double) node->perf_cycles / (double) ggml_cycles_per_ms(), + (double) node->perf_cycles / (double) ggml_cycles_per_ms() / (double) node->perf_runs, + (double) node->perf_time_us / 1000.0, + (double) node->perf_time_us / 1000.0 / node->perf_runs); + } + + GGML_PRINT("n_leafs = %d\n", cgraph->n_leafs); + for (int i = 0; i < cgraph->n_leafs; i++) { + struct ggml_tensor * node = cgraph->leafs[i]; + + GGML_PRINT(" - %3d: [ %6d, %6d] %8s\n", + i, + node->ne[0], node->ne[1], + GGML_OP_LABEL[node->op]); + } + + for (int i = 0; i < GGML_OP_COUNT; i++) { + GGML_PRINT("perf_total_per_op_us[%16s] = %7.3f ms\n", GGML_OP_LABEL[i], (double) perf_total_per_op_us[i] / 1000.0); + } + + GGML_PRINT("========================================\n"); +} + +// check if node is part of the graph +bool ggml_graph_find(const struct ggml_cgraph * cgraph, const struct ggml_tensor * node) { + if (cgraph == NULL) { + return true; + } + + for (int i = 0; i < cgraph->n_nodes; i++) { + if (cgraph->nodes[i] == node) { + return true; + } + } + + return false; +} + +struct ggml_tensor * ggml_graph_get_parent(const struct ggml_cgraph * cgraph, const struct ggml_tensor * node) { + for (int i = 0; i < cgraph->n_nodes; i++) { + struct ggml_tensor * parent = cgraph->nodes[i]; + + if (parent->grad == node) { + return parent; + } + } + + return NULL; +} + +void ggml_graph_dump_dot(const struct ggml_cgraph * gb, const struct ggml_cgraph * gf, const char * filename) { + char color[16]; + + FILE * fp = fopen(filename, "w"); + assert(fp); + + fprintf(fp, "digraph G {\n"); + fprintf(fp, " newrank = true;\n"); + fprintf(fp, " rankdir = LR;\n"); + + for (int i = 0; i < gb->n_nodes; i++) { + struct ggml_tensor * node = gb->nodes[i]; + + if (ggml_graph_get_parent(gb, node) != NULL) { + continue; + } + + if (node->is_param) { + snprintf(color, sizeof(color), "yellow"); + } else if (node->grad) { + if (ggml_graph_find(gf, node)) { + snprintf(color, sizeof(color), "green"); + } else { + snprintf(color, sizeof(color), "lightblue"); + } + } else { + snprintf(color, sizeof(color), "white"); + } + + fprintf(fp, " \"%p\" [ \ +style = filled; fillcolor = %s; shape = record; \ +label=\"%d [%d, %d] | %s", + (void *) node, color, + i, node->ne[0], node->ne[1], + GGML_OP_SYMBOL[node->op]); + + if (node->grad) { + fprintf(fp, " | %s\"; ]\n", GGML_OP_SYMBOL[node->grad->op]); + } else { + fprintf(fp, "\"; ]\n"); + } + } + + for (int i = 0; i < gb->n_leafs; i++) { + struct ggml_tensor * node = gb->leafs[i]; + + snprintf(color, sizeof(color), "pink"); + + if (ggml_nelements(node) == 1) { + fprintf(fp, " \"%p\" [ \ +style 
= filled; fillcolor = %s; shape = record; \ +label=\"%.1e\"; ]\n", + (void *) node, color, ggml_get_f32_1d(node, 0)); + } else { + fprintf(fp, " \"%p\" [ \ +style = filled; fillcolor = %s; shape = record; \ +label=\"CONST %d [%d, %d]\"; ]\n", + (void *) node, color, + i, node->ne[0], node->ne[1]); + } + } + + for (int i = 0; i < gb->n_nodes; i++) { + struct ggml_tensor * node = gb->nodes[i]; + + struct ggml_tensor * parent = ggml_graph_get_parent(gb, node); + + if (node->src0) { + struct ggml_tensor * parent0 = ggml_graph_get_parent(gb, node->src0); + + fprintf(fp, " \"%p\":%s -> \"%p\":%s [ arrowhead = %s; style = %s; label = \"x\"; ]\n", + parent0 ? (void *) parent0 : (void *) node->src0, + parent0 ? "g" : "x", + parent ? (void *) parent : (void *) node, + parent ? "g" : "x", + parent ? "empty" : "vee", + parent ? "dashed" : "solid"); + } + + if (node->src1) { + struct ggml_tensor * parent1 = ggml_graph_get_parent(gb, node->src1); + + fprintf(fp, " \"%p\":%s -> \"%p\":%s [ arrowhead = %s; style = %s; label = \"y\"; ]\n", + parent1 ? (void *) parent1 : (void *) node->src1, + parent1 ? "g" : "x", + parent ? (void *) parent : (void *) node, + parent ? "g" : "x", + parent ? "empty" : "vee", + parent ? "dashed" : "solid"); + } + } + + for (int i = 0; i < gb->n_leafs; i++) { + struct ggml_tensor * node = gb->leafs[i]; + + if (node->src0) { + fprintf(fp, " \"%p\":%s -> \"%p\":%s [ label = \"x\"; ]\n", + (void *) node->src0, "x", + (void *) node, "x"); + } + + if (node->src1) { + fprintf(fp, " \"%p\":%s -> \"%p\":%s [ label = \"y\"; ]\n", + (void *) node->src1, "x", + (void *) node, "x"); + } + } + + fprintf(fp, "}\n"); + + fclose(fp); + + GGML_PRINT("%s: dot -Tpng %s -o %s.png && open %s.png\n", __func__, filename, filename, filename); +} + +//////////////////////////////////////////////////////////////////////////////// + +void ggml_opt_set_params(int np, struct ggml_tensor * const ps[], const float * x) { + int i = 0; + for (int p = 0; p < np; ++p) { + const int ne = ggml_nelements(ps[p]) ; + // TODO: add function to set tensor from array + for (int j = 0; j < ne; ++j) { + ggml_set_f32_1d(ps[p], j, x[i++]); + } + } +} + +void ggml_opt_get_params(int np, struct ggml_tensor * const ps[], float * x) { + int i = 0; + for (int p = 0; p < np; ++p) { + const int ne = ggml_nelements(ps[p]) ; + // TODO: add function to get all elements at once + for (int j = 0; j < ne; ++j) { + x[i++] = ggml_get_f32_1d(ps[p], j); + } + } +} + +void ggml_opt_get_grad(int np, struct ggml_tensor * const ps[], float * g) { + int i = 0; + for (int p = 0; p < np; ++p) { + const int ne = ggml_nelements(ps[p]) ; + // TODO: add function to get all elements at once + for (int j = 0; j < ne; ++j) { + g[i++] = ggml_get_f32_1d(ps[p]->grad, j); + } + } +} + +// +// ADAM +// +// ref: https://arxiv.org/pdf/1412.6980.pdf +// + +enum ggml_opt_result ggml_opt_adam( + struct ggml_context * ctx, + struct ggml_opt_params params, + struct ggml_tensor * f, + struct ggml_cgraph * gf, + struct ggml_cgraph * gb) { + assert(ggml_is_scalar(f)); + + gf->n_threads = params.n_threads; + gb->n_threads = params.n_threads; + + // these will store the parameters we want to optimize + struct ggml_tensor * ps[GGML_MAX_PARAMS]; + + int np = 0; + int nx = 0; + for (int i = 0; i < gf->n_nodes; ++i) { + if (gf->nodes[i]->is_param) { + GGML_PRINT_DEBUG("found param %d: grad->op = %d\n", np, gf->nodes[i]->grad->op); + + assert(np < GGML_MAX_PARAMS); + + ps[np++] = gf->nodes[i]; + nx += ggml_nelements(gf->nodes[i]); + } + } + + // constants + const float alpha = 
params.adam.alpha; + const float beta1 = params.adam.beta1; + const float beta2 = params.adam.beta2; + const float eps = params.adam.eps; + + float * x = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, nx)->data; // view of the parameters + float * g1 = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, nx)->data; // gradient + float * g2 = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, nx)->data; // gradient squared + float * m = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, nx)->data; // first moment + float * v = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, nx)->data; // second moment + float * mh = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, nx)->data; // first moment hat + float * vh = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, nx)->data; // second moment hat + + float * pf = params.past > 0 ? ggml_new_tensor_1d(ctx, GGML_TYPE_F32, params.past)->data : NULL; // past function values + + // initialize + ggml_vec_set_f32(nx, m, 0.0f); + ggml_vec_set_f32(nx, v, 0.0f); + + // update view + ggml_opt_get_params(np, ps, x); + + // compute the function value + ggml_graph_reset (gf); + ggml_set_f32 (f->grad, 1.0f); + ggml_graph_compute(ctx, gb); + + float fx_prev = ggml_get_f32_1d(f, 0); + if (pf) { + pf[0] = fx_prev; + } + + int n_no_improvement = 0; + float fx_best = fx_prev; + + // run the optimizer + for (int t = 0; t < params.adam.n_iter; ++t) { + GGML_PRINT_DEBUG ("=== iter %d ===\n", t); + + GGML_PRINT_DEBUG ("f = %10.6f\n", ggml_get_f32_1d(f, 0)); + GGML_PRINT_DEBUG_5("df/dx0 = %10.6f\n", ggml_get_f32_1d(ps[0]->grad, 0)); + GGML_PRINT_DEBUG_5("df/dx1 = %10.6f\n", ggml_get_f32_1d(ps[1]->grad, 0)); + + for (int i = 0; i < np; ++i) { + GGML_PRINT_DEBUG("param %d: %10.6f, g = %10.6f\n", i, + ggml_get_f32_1d(ps[i], 0), ggml_get_f32_1d(ps[i]->grad, 0)); + } + + const int64_t t_start_wall = ggml_time_us(); + const int64_t t_start_cpu = ggml_cycles(); + UNUSED(t_start_wall); + UNUSED(t_start_cpu); + + { + // update the gradient + ggml_opt_get_grad(np, ps, g1); + + // m_t = beta1*m_t-1 + (1 - beta1)*g_t + ggml_vec_scale_f32(nx, m, beta1); + ggml_vec_mad_f32 (nx, m, g1, 1.0f - beta1); + + // g2 = g1^2 + ggml_vec_sqr_f32 (nx, g2, g1); + + // v_t = beta2*v_t-1 + (1 - beta2)*g_t^2 + ggml_vec_scale_f32(nx, v, beta2); + ggml_vec_mad_f32 (nx, v, g2, 1.0f - beta2); + + // m^hat = m_t / (1 - beta1^t) + // v^hat = v_t / (1 - beta2^t) + // x_t = x_t-1 - alpha*m^hat/(sqrt(v^hat) + eps) + ggml_vec_cpy_f32 (nx, mh, m); + ggml_vec_cpy_f32 (nx, vh, v); + + ggml_vec_scale_f32(nx, mh, alpha/(1.0f - powf(beta1, t + 1))); + ggml_vec_scale_f32(nx, vh, 1.0f/(1.0f - powf(beta2, t + 1))); + + ggml_vec_sqrt_f32 (nx, vh, vh); + ggml_vec_acc1_f32 (nx, vh, eps); + + ggml_vec_div_f32 (nx, mh, mh, vh); + ggml_vec_sub_f32 (nx, x, x, mh); + + // update the parameters + ggml_opt_set_params(np, ps, x); + } + + ggml_graph_reset (gf); + ggml_set_f32 (f->grad, 1.0f); + ggml_graph_compute(ctx, gb); + + const float fx = ggml_get_f32_1d(f, 0); + + // check convergence + if (fabsf(fx - fx_prev)/fx < params.adam.eps_f) { + GGML_PRINT_DEBUG("converged\n"); + + return GGML_OPT_OK; + } + + // delta-based convergence test + if (pf != NULL) { + // need at least params.past iterations to start checking for convergence + if (params.past <= t) { + const float rate = (pf[t%params.past] - fx)/fx; + + if (fabs(rate) < params.delta) { + return GGML_OPT_OK; + } + } + + pf[t%params.past] = fx; + } + + // check for improvement + if (params.max_no_improvement > 0) { + if (fx_best > fx) { + fx_best = fx; + n_no_improvement = 0; + } else { + ++n_no_improvement; + + if (n_no_improvement >= 
params.max_no_improvement) { + return GGML_OPT_OK; + } + } + } + + fx_prev = fx; + + { + const int64_t t_end_cpu = ggml_cycles(); + GGML_PRINT_DEBUG("time iter: %5.3f s\n", ((float)(t_end_cpu - t_start_cpu))/CLOCKS_PER_SEC); + UNUSED(t_end_cpu); + + const int64_t t_end_wall = ggml_time_us(); + GGML_PRINT_DEBUG("wall time iter: %5.3f s\n", (t_end_wall - t_start_wall)/1e6); + UNUSED(t_end_wall); + } + } + + return GGML_OPT_DID_NOT_CONVERGE; +} + +// +// L-BFGS +// +// the L-BFGS implementation below is based on the following implementation: +// +// https://github.com/chokkan/liblbfgs +// + +struct ggml_lbfgs_iteration_data { + float alpha; + float ys; + float * s; + float * y; +}; + +static enum ggml_opt_result linesearch_backtracking( + struct ggml_context * ctx, + const struct ggml_opt_params * params, + int nx, + float * x, + float * fx, + float * g, + float * d, + float * step, + const float * xp, + struct ggml_tensor * f, + struct ggml_cgraph * gf, + struct ggml_cgraph * gb, + const int np, + struct ggml_tensor * ps[]) { + int count = 0; + + float width = 0.0f; + float dg = 0.0f; + float finit = 0.0f; + float dginit = 0.0f; + float dgtest = 0.0f; + + const float dec = 0.5f; + const float inc = 2.1f; + + if (*step <= 0.) { + return GGML_LINESEARCH_INVALID_PARAMETERS; + } + + // compute the initial gradient in the search direction + ggml_vec_dot_f32(nx, &dginit, g, d); + + // make sure that d points to a descent direction + if (0 < dginit) { + return GGML_LINESEARCH_FAIL; + } + + // initialize local variables + finit = *fx; + dgtest = params->lbfgs.ftol*dginit; + + while (true) { + ggml_vec_cpy_f32(nx, x, xp); + ggml_vec_mad_f32(nx, x, d, *step); + + // evaluate the function and gradient values + { + ggml_opt_set_params(np, ps, x); + + ggml_graph_reset (gf); + ggml_set_f32 (f->grad, 1.0f); + ggml_graph_compute(ctx, gb); + + ggml_opt_get_grad(np, ps, g); + + *fx = ggml_get_f32_1d(f, 0); + } + + ++count; + + if (*fx > finit + (*step)*dgtest) { + width = dec; + } else { + // Armijo condition is satisfied + if (params->lbfgs.linesearch == GGML_LINESEARCH_BACKTRACKING_ARMIJO) { + return count; + } + + ggml_vec_dot_f32(nx, &dg, g, d); + + // check the Wolfe condition + if (dg < params->lbfgs.wolfe * dginit) { + width = inc; + } else { + if(params->lbfgs.linesearch == GGML_LINESEARCH_BACKTRACKING_WOLFE) { + // regular Wolfe conditions + return count; + } + + if(dg > -params->lbfgs.wolfe*dginit) { + width = dec; + } else { + // strong Wolfe condition (GGML_LINESEARCH_BACKTRACKING_STRONG_WOLFE) + return count; + } + return count; + } + } + + if (*step < params->lbfgs.min_step) { + return GGML_LINESEARCH_MINIMUM_STEP; + } + if (*step > params->lbfgs.max_step) { + return GGML_LINESEARCH_MAXIMUM_STEP; + } + if (params->lbfgs.max_linesearch <= count) { + return GGML_LINESEARCH_MAXIMUM_ITERATIONS; + } + + (*step) *= width; + } + + return GGML_LINESEARCH_FAIL; +} + +enum ggml_opt_result ggml_opt_lbfgs( + struct ggml_context * ctx, + struct ggml_opt_params params, + struct ggml_tensor * f, + struct ggml_cgraph * gf, + struct ggml_cgraph * gb) { + if (params.lbfgs.linesearch == GGML_LINESEARCH_BACKTRACKING_WOLFE || + params.lbfgs.linesearch == GGML_LINESEARCH_BACKTRACKING_STRONG_WOLFE) { + if (params.lbfgs.wolfe <= params.lbfgs.ftol || 1. 
<= params.lbfgs.wolfe) { + return GGML_OPT_INVALID_WOLFE; + } + } + + gf->n_threads = params.n_threads; + gb->n_threads = params.n_threads; + + const int m = params.lbfgs.m; + + // these will store the parameters we want to optimize + struct ggml_tensor * ps[GGML_MAX_PARAMS]; + + int np = 0; + int nx = 0; + for (int i = 0; i < gf->n_nodes; ++i) { + if (gf->nodes[i]->is_param) { + GGML_PRINT_DEBUG("found param %d: grad->op = %d\n", np, gf->nodes[i]->grad->op); + + assert(np < GGML_MAX_PARAMS); + + ps[np++] = gf->nodes[i]; + nx += ggml_nelements(gf->nodes[i]); + } + } + + float * x = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, nx)->data; // current parameters + float * xp = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, nx)->data; // previous parameters + float * g = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, nx)->data; // current gradient + float * gp = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, nx)->data; // previous gradient + float * d = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, nx)->data; // search direction + + float * pf = params.past > 0 ? ggml_new_tensor_1d(ctx, GGML_TYPE_F32, params.past)->data : NULL; // past function values + + float fx = 0.0f; // cost function value + float xnorm = 0.0f; // ||x|| + float gnorm = 0.0f; // ||g|| + float step = 0.0f; + + // initialize x from the graph nodes + ggml_opt_get_params(np, ps, x); + + // the L-BFGS memory + struct ggml_lbfgs_iteration_data * lm = alloca(sizeof(struct ggml_lbfgs_iteration_data)*m); + + for (int i = 0; i < m; ++i) { + lm[i].alpha = 0.0f; + lm[i].ys = 0.0f; + lm[i].s = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, nx)->data; + lm[i].y = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, nx)->data; + } + + // evaluate the function value and its gradient + { + ggml_opt_set_params(np, ps, x); + + ggml_graph_reset (gf); + ggml_set_f32 (f->grad, 1.0f); + ggml_graph_compute(ctx, gb); + + ggml_opt_get_grad(np, ps, g); + + fx = ggml_get_f32_1d(f, 0); + } + + if (pf) { + pf[0] = fx; + } + + float fx_best = fx; + + // search direction = -gradient + ggml_vec_neg_f32(nx, d, g); + + // ||x||, ||g|| + ggml_vec_norm_f32(nx, &xnorm, x); + ggml_vec_norm_f32(nx, &gnorm, g); + + if (xnorm < 1.0f) { + xnorm = 1.0f; + } + + // already optimized + if (gnorm/xnorm <= params.lbfgs.eps) { + return GGML_OPT_OK; + } + + // initial step + ggml_vec_norm_inv_f32(nx, &step, d); + + int j = 0; + int k = 1; + int ls = 0; + int end = 0; + int bound = 0; + int n_no_improvement = 0; + + float ys = 0.0f; + float yy = 0.0f; + float beta = 0.0f; + + while (true) { + // store the current position and gradient vectors + ggml_vec_cpy_f32(nx, xp, x); + ggml_vec_cpy_f32(nx, gp, g); + + ls = linesearch_backtracking(ctx, ¶ms, nx, x, &fx, g, d, &step, xp, f, gf, gb, np, ps); + + if (ls < 0) { + // linesearch failed - go back to the previous point and return + ggml_vec_cpy_f32(nx, x, xp); + ggml_vec_cpy_f32(nx, g, gp); + + return ls; + } + + ggml_vec_norm_f32(nx, &xnorm, x); + ggml_vec_norm_f32(nx, &gnorm, g); + + GGML_PRINT_DEBUG("f = %10.6f\n", ggml_get_f32_1d(f, 0)); + + if (xnorm < 1.0) { + xnorm = 1.0; + } + if (gnorm/xnorm <= params.lbfgs.eps) { + // converged + return GGML_OPT_OK; + } + + // delta-based convergence test + if (pf != NULL) { + // need at least params.past iterations to start checking for convergence + if (params.past <= k) { + const float rate = (pf[k%params.past] - fx)/fx; + + if (fabs(rate) < params.delta) { + return GGML_OPT_OK; + } + } + + pf[k%params.past] = fx; + } + + // check for improvement + if (params.max_no_improvement > 0) { + if (fx < fx_best) { + fx_best = fx; + 
n_no_improvement = 0; + } else { + n_no_improvement++; + + if (n_no_improvement >= params.max_no_improvement) { + return GGML_OPT_OK; + } + } + } + + if (params.lbfgs.n_iter != 0 && params.lbfgs.n_iter < k + 1) { + // reached the maximum number of iterations + return GGML_OPT_DID_NOT_CONVERGE; + } + + // update vectors s and y: + // s_{k+1} = x_{k+1} - x_{k} = \step * d_{k}. + // y_{k+1} = g_{k+1} - g_{k}. + // + ggml_vec_sub_f32(nx, lm[end].s, x, xp); + ggml_vec_sub_f32(nx, lm[end].y, g, gp); + + // compute scalars ys and yy: + // ys = y^t \cdot s -> 1 / \rho. + // yy = y^t \cdot y. + // + ggml_vec_dot_f32(nx, &ys, lm[end].y, lm[end].s); + ggml_vec_dot_f32(nx, &yy, lm[end].y, lm[end].y); + + lm[end].ys = ys; + + // find new search direction + // ref: https://en.wikipedia.org/wiki/Limited-memory_BFGS + + bound = (m <= k) ? m : k; + k++; + end = (end + 1)%m; + + // initialize search direction with -g + ggml_vec_neg_f32(nx, d, g); + + j = end; + for (int i = 0; i < bound; ++i) { + j = (j + m - 1) % m; + // \alpha_{j} = \rho_{j} s^{t}_{j} \cdot q_{k+1} + ggml_vec_dot_f32(nx, &lm[j].alpha, lm[j].s, d); + lm[j].alpha /= lm[j].ys; + // q_{i} = q_{i+1} - \alpha_{i} y_{i} + ggml_vec_mad_f32(nx, d, lm[j].y, -lm[j].alpha); + } + + ggml_vec_scale_f32(nx, d, ys/yy); + + for (int i = 0; i < bound; ++i) { + // \beta_{j} = \rho_{j} y^t_{j} \cdot \gamma_{i} + ggml_vec_dot_f32(nx, &beta, lm[j].y, d); + beta /= lm[j].ys; + // \gamma_{i+1} = \gamma_{i} + (\alpha_{j} - \beta_{j}) s_{j} + ggml_vec_mad_f32(nx, d, lm[j].s, lm[j].alpha - beta); + j = (j + 1)%m; + } + + step = 1.0; + } + + return GGML_OPT_DID_NOT_CONVERGE; +} + +struct ggml_opt_params ggml_opt_default_params(enum ggml_opt_type type) { + struct ggml_opt_params result; + + switch (type) { + case GGML_OPT_ADAM: + { + result = (struct ggml_opt_params) { + .type = GGML_OPT_ADAM, + .n_threads = 1, + .past = 0, + .delta = 1e-5f, + + .max_no_improvement = 100, + + .print_forward_graph = true, + .print_backward_graph = true, + + .adam = { + .n_iter = 10000, + .alpha = 0.001f, + .beta1 = 0.9f, + .beta2 = 0.999f, + .eps = 1e-8f, + .eps_f = 1e-5f, + .eps_g = 1e-3f, + }, + }; + } break; + case GGML_OPT_LBFGS: + { + result = (struct ggml_opt_params) { + .type = GGML_OPT_LBFGS, + .n_threads = 1, + .past = 0, + .delta = 1e-5f, + + .max_no_improvement = 0, + + .print_forward_graph = true, + .print_backward_graph = true, + + .lbfgs = { + .m = 6, + .n_iter = 100, + .max_linesearch = 20, + + .eps = 1e-5f, + .ftol = 1e-4f, + .wolfe = 0.9f, + .min_step = 1e-20f, + .max_step = 1e+20f, + + .linesearch = GGML_LINESEARCH_DEFAULT, + }, + }; + } break; + } + + return result; +} + +enum ggml_opt_result ggml_opt( + struct ggml_context * ctx, + struct ggml_opt_params params, + struct ggml_tensor * f) { + bool free_ctx = false; + if (ctx == NULL) { + struct ggml_init_params params_ctx = { + .mem_size = 16*1024*1024, + .mem_buffer = NULL, + }; + + ctx = ggml_init(params_ctx); + if (ctx == NULL) { + return GGML_OPT_NO_CONTEXT; + } + + free_ctx = true; + } + + enum ggml_opt_result result = GGML_OPT_OK; + + // build forward + backward compute graphs + struct ggml_cgraph gf = ggml_build_forward (f); + struct ggml_cgraph gb = ggml_build_backward(ctx, &gf, false); + + switch (params.type) { + case GGML_OPT_ADAM: + { + result = ggml_opt_adam(ctx, params, f, &gf, &gb); + } break; + case GGML_OPT_LBFGS: + { + result = ggml_opt_lbfgs(ctx, params, f, &gf, &gb); + } break; + } + + if (params.print_forward_graph) { + ggml_graph_print (&gf); + ggml_graph_dump_dot(&gf, NULL, "opt-forward.dot"); 
+ } + + if (params.print_backward_graph) { + ggml_graph_print (&gb); + ggml_graph_dump_dot(&gb, &gf, "opt-backward.dot"); + } + + if (free_ctx) { + ggml_free(ctx); + } + + return result; +} + +//////////////////////////////////////////////////////////////////////////////// + +int ggml_cpu_has_avx(void) { +#if defined(__AVX__) + return 1; +#else + return 0; +#endif +} + +int ggml_cpu_has_avx2(void) { +#if defined(__AVX2__) + return 1; +#else + return 0; +#endif +} + +int ggml_cpu_has_avx512(void) { +#if defined(__AVX512F__) + return 1; +#else + return 0; +#endif +} + +int ggml_cpu_has_neon(void) { +#if defined(__ARM_NEON) + return 1; +#else + return 0; +#endif +} + +int ggml_cpu_has_f16c(void) { +#if defined(__F16C__) + return 1; +#else + return 0; +#endif +} + +int ggml_cpu_has_fp16_va(void) { +#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) + return 1; +#else + return 0; +#endif +} + +int ggml_cpu_has_wasm_simd(void) { +#if defined(__wasm_simd128__) + return 1; +#else + return 0; +#endif +} + +int ggml_cpu_has_blas(void) { +#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) + return 1; +#else + return 0; +#endif +} + +//////////////////////////////////////////////////////////////////////////////// diff --git a/examples/whisper.android/app/src/main/jni/whisper/libwhisper/ggml.h b/examples/whisper.android/app/src/main/jni/whisper/libwhisper/ggml.h new file mode 100644 index 0000000..6dd5d04 --- /dev/null +++ b/examples/whisper.android/app/src/main/jni/whisper/libwhisper/ggml.h @@ -0,0 +1,737 @@ +#pragma once + +// +// GGML Tensor Library +// +// This documentation is still a work in progress. +// If you wish some specific topics to be covered, feel free to drop a comment: +// +// https://github.com/ggerganov/whisper.cpp/issues/40 +// +// ## Overview +// +// This library implements: +// +// - a set of tensor operations +// - automatic differentiation +// - basic optimization algorithms +// +// The aim of this library is to provide a minimalistic approach for various machine learning tasks. This includes, +// but is not limited to, the following: +// +// - linear regression +// - support vector machines +// - neural networks +// +// The library allows the user to define a certain function using the available tensor operations. This function +// definition is represented internally via a computation graph. Each tensor operation in the function definition +// corresponds to a node in the graph. Having the computation graph defined, the user can choose to compute the +// function's value and/or its gradient with respect to the input variables. Optionally, the function can be optimized +// using one of the available optimization algorithms. +// +// For example, here we define the function: f(x) = a*x^2 + b +// +// { +// struct ggml_init_params params = { +// .mem_size = 16*1024*1024, +// .mem_buffer = NULL, +// }; +// +// // memory allocation happens here +// struct ggml_context * ctx = ggml_init(params); +// +// struct ggml_tensor * x = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 1); +// +// ggml_set_param(ctx, x); // x is an input variable +// +// struct ggml_tensor * a = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 1); +// struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 1); +// struct ggml_tensor * x2 = ggml_mul(ctx, x, x); +// struct ggml_tensor * f = ggml_add(ctx, ggml_mul(ctx, a, x2), b); +// +// ... +// } +// +// Notice that the function definition above does not involve any actual computation. The computation is performed only +// when the user explicitly requests it. 
For example, to compute the function's value at x = 2.0: +// +// { +// ... +// +// struct ggml_cgraph gf = ggml_build_forward(f); +// +// // set the input variable and parameter values +// ggml_set_f32(x, 2.0f); +// ggml_set_f32(a, 3.0f); +// ggml_set_f32(b, 4.0f); +// +// ggml_graph_compute(ctx, &gf); +// +// printf("f = %f\n", ggml_get_f32_1d(f, 0)); +// +// ... +// } +// +// The actual computation is performed in the ggml_graph_compute() function. +// +// The ggml_new_tensor_...() functions create new tensors. They are allocated in the memory buffer provided to the +// ggml_init() function. You have to be careful not to exceed the memory buffer size. Therefore, you have to know +// in advance how much memory you need for your computation. Alternatively, you can allocate a large enough memory buffer +// and, after defining the computation graph, call the ggml_used_mem() function to find out how much memory was +// actually needed. +// +// The ggml_set_param() function marks a tensor as an input variable. This is used by the automatic +// differentiation and optimization algorithms. +// +// The described approach allows the user to define the function graph once and then compute its forward or backward graphs +// multiple times. All computations will use the same memory buffer allocated in the ggml_init() function. This way +// the user can avoid the memory allocation overhead at runtime. +// +// The library supports multi-dimensional tensors - up to 4 dimensions. The FP16 and FP32 data types are first-class +// citizens, but in theory the library can be extended to support FP8 and integer data types. +// +// Each tensor operation produces a new tensor. Initially the library was envisioned to support only the use of unary +// and binary operations. Most of the available operations fall into one of these two categories. With time, it became +// clear that the library needs to support more complex operations. The way to support these operations is not clear +// yet, but a few examples are demonstrated in the following operations: +// +// - ggml_permute() +// - ggml_conv_1d_1s() +// - ggml_conv_1d_2s() +// +// For each tensor operator, the library implements a forward and backward computation function. The forward function +// computes the output tensor value given the input tensor values. The backward function computes the adjoint of the +// input tensors given the adjoint of the output tensor. For a detailed explanation of what this means, take a +// calculus class, or watch the following video: +// +// What is Automatic Differentiation? +// https://www.youtube.com/watch?v=wG_nF1awSSY +// +// +// ## Tensor data (struct ggml_tensor) +// +// The tensors are stored in memory via the ggml_tensor struct. The structure provides information about the size of +// the tensor, the data type, and the memory buffer where the tensor data is stored. Additionally, it contains +// pointers to the "source" tensors - i.e. the tensors that were used to compute the current tensor. For example: +// +// { +// struct ggml_tensor * c = ggml_add(ctx, a, b); +// +// assert(c->src[0] == a); +// assert(c->src[1] == b); +// } +// +// The multi-dimensional tensors are stored in row-major order. The ggml_tensor struct contains fields for the +// number of elements in each dimension ("ne") as well as the number of bytes ("nb", a.k.a. stride). This makes it possible +// to store tensors that are not contiguous in memory, which is useful for operations such as transposition and +// permutation. 
All tensor operations have to take the stride into account and not assume that the tensor is +// contiguous in memory. +// +// The data of the tensor is accessed via the "data" pointer. For example: +// +// { +// struct ggml_tensor * a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 2, 3); +// +// // a[1, 2] = 1.0f; +// *(float *) ((char *) a->data + 2*a->nb[1] + 1*a->nb[0]) = 1.0f; +// +// // a[0, 2] = 2.0f; +// *(float *) ((char *) a->data + 2*a->nb[1] + 0*a->nb[0]) = 2.0f; +// +// ... +// } +// +// Alternatively, there are helper functions, such as ggml_get_f32_1d() and ggml_set_f32_1d(), that can be used. +// +// ## The matrix multiplication operator (ggml_mul_mat) +// +// TODO +// +// +// ## Multi-threading +// +// TODO +// +// +// ## Overview of ggml.c +// +// TODO +// +// +// ## SIMD optimizations +// +// TODO +// +// +// ## Debugging ggml +// +// TODO +// +// + +#ifdef __cplusplus +extern "C" { +#endif + +#include <stdint.h> +#include <stddef.h> +#include <stdbool.h> + +#define GGML_MAX_DIMS 4 +#define GGML_MAX_NODES 4096 +#define GGML_MAX_PARAMS 16 +#define GGML_MAX_CONTEXTS 64 +#define GGML_MAX_OPT 4 + +#ifdef __ARM_NEON +// we use the built-in 16-bit float type +typedef __fp16 ggml_fp16_t; +#else +typedef uint16_t ggml_fp16_t; +#endif + +// convert FP16 <-> FP32 +float ggml_fp16_to_fp32(ggml_fp16_t x); +ggml_fp16_t ggml_fp32_to_fp16(float x); + +struct ggml_object; +struct ggml_context; + +enum ggml_type { + GGML_TYPE_I8, + GGML_TYPE_I16, + GGML_TYPE_I32, + GGML_TYPE_F16, + GGML_TYPE_F32, + GGML_TYPE_COUNT, +}; + +// available tensor operations: +enum ggml_op { + GGML_OP_NONE = 0, + + GGML_OP_DUP, + GGML_OP_ADD, + GGML_OP_SUB, + GGML_OP_MUL, + GGML_OP_DIV, + GGML_OP_SQR, + GGML_OP_SQRT, + GGML_OP_SUM, + GGML_OP_MEAN, + GGML_OP_REPEAT, + GGML_OP_ABS, + GGML_OP_SGN, + GGML_OP_NEG, + GGML_OP_STEP, + GGML_OP_RELU, + GGML_OP_GELU, + GGML_OP_NORM, // normalize + + GGML_OP_MUL_MAT, + + GGML_OP_SCALE, + GGML_OP_CPY, + GGML_OP_RESHAPE, + GGML_OP_VIEW, + GGML_OP_PERMUTE, + GGML_OP_TRANSPOSE, + GGML_OP_GET_ROWS, + GGML_OP_DIAG_MASK_INF, + GGML_OP_SOFT_MAX, + GGML_OP_ROPE, + GGML_OP_CONV_1D_1S, + GGML_OP_CONV_1D_2S, + + GGML_OP_FLASH_ATTN, + GGML_OP_FLASH_FF, + + GGML_OP_COUNT, +}; + +// n-dimensional tensor +struct ggml_tensor { + enum ggml_type type; + + int n_dims; + int ne[GGML_MAX_DIMS]; // number of elements + size_t nb[GGML_MAX_DIMS]; // stride in bytes: + // nb[0] = sizeof(type) + // nb[1] = nb[0] * ne[0] + padding + // nb[i] = nb[i-1] * ne[i-1] + + // compute data + enum ggml_op op; + + bool is_param; + + struct ggml_tensor * grad; + struct ggml_tensor * src0; + struct ggml_tensor * src1; + struct ggml_tensor * opt[GGML_MAX_OPT]; + + // thread scheduling + int n_tasks; + + // performance + int perf_runs; + int64_t perf_cycles; + int64_t perf_time_us; + + void * data; + char padding[8]; +}; + +// computation graph +struct ggml_cgraph { + int n_nodes; + int n_leafs; + int n_threads; + + size_t work_size; + struct ggml_tensor * work; + + struct ggml_tensor * nodes[GGML_MAX_NODES]; + struct ggml_tensor * grads[GGML_MAX_NODES]; + struct ggml_tensor * leafs[GGML_MAX_NODES]; + + // performance + int perf_runs; + int64_t perf_cycles; + int64_t perf_time_us; +}; + +struct ggml_init_params { + // memory pool + size_t mem_size; // bytes + void * mem_buffer; // if NULL, memory will be allocated internally +}; + +void ggml_time_init(void); // call this once at the beginning of the program +int64_t ggml_time_ms(void); +int64_t ggml_time_us(void); +int64_t ggml_cycles(void); +int64_t ggml_cycles_per_ms(void); + +void ggml_print_object 
(const struct ggml_object * obj); +void ggml_print_objects(const struct ggml_context * ctx); + +int ggml_nelements(const struct ggml_tensor * tensor); +size_t ggml_nbytes (const struct ggml_tensor * tensor); + +size_t ggml_type_size (enum ggml_type type); +size_t ggml_element_size(const struct ggml_tensor * tensor); + +struct ggml_context * ggml_init(struct ggml_init_params params); +void ggml_free(struct ggml_context * ctx); + +size_t ggml_used_mem(const struct ggml_context * ctx); + +struct ggml_tensor * ggml_new_tensor( + struct ggml_context * ctx, + enum ggml_type type, + int n_dims, + const int *ne); + +struct ggml_tensor * ggml_new_tensor_1d( + struct ggml_context * ctx, + enum ggml_type type, + int ne0); + +struct ggml_tensor * ggml_new_tensor_2d( + struct ggml_context * ctx, + enum ggml_type type, + int ne0, + int ne1); + +struct ggml_tensor * ggml_new_tensor_3d( + struct ggml_context * ctx, + enum ggml_type type, + int ne0, + int ne1, + int ne2); + +struct ggml_tensor * ggml_new_tensor_4d( + struct ggml_context * ctx, + enum ggml_type type, + int ne0, + int ne1, + int ne2, + int ne3); + +struct ggml_tensor * ggml_new_i32(struct ggml_context * ctx, int32_t value); +struct ggml_tensor * ggml_new_f32(struct ggml_context * ctx, float value); + +struct ggml_tensor * ggml_dup_tensor (struct ggml_context * ctx, const struct ggml_tensor * src); +struct ggml_tensor * ggml_view_tensor(struct ggml_context * ctx, const struct ggml_tensor * src); + +struct ggml_tensor * ggml_set_zero(struct ggml_tensor * tensor); +struct ggml_tensor * ggml_set_i32 (struct ggml_tensor * tensor, int32_t value); +struct ggml_tensor * ggml_set_f32 (struct ggml_tensor * tensor, float value); + +int32_t ggml_get_i32_1d(const struct ggml_tensor * tensor, int i); +void ggml_set_i32_1d(const struct ggml_tensor * tensor, int i, int32_t value); + +float ggml_get_f32_1d(const struct ggml_tensor * tensor, int i); +void ggml_set_f32_1d(const struct ggml_tensor * tensor, int i, float value); + + void * ggml_get_data (const struct ggml_tensor * tensor); +float * ggml_get_data_f32(const struct ggml_tensor * tensor); + +// +// operations on tensors with backpropagation +// + +struct ggml_tensor * ggml_dup( + struct ggml_context * ctx, + struct ggml_tensor * a); + +struct ggml_tensor * ggml_add( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b); + +struct ggml_tensor * ggml_sub( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b); + +struct ggml_tensor * ggml_mul( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b); + +struct ggml_tensor * ggml_div( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b); + +struct ggml_tensor * ggml_sqr( + struct ggml_context * ctx, + struct ggml_tensor * a); + +struct ggml_tensor * ggml_sqrt( + struct ggml_context * ctx, + struct ggml_tensor * a); + +// return scalar +// TODO: compute sum along rows +struct ggml_tensor * ggml_sum( + struct ggml_context * ctx, + struct ggml_tensor * a); + +// mean along rows +struct ggml_tensor * ggml_mean( + struct ggml_context * ctx, + struct ggml_tensor * a); + +// if a is the same shape as b, and a is not parameter, return a +// otherwise, return a new tensor: repeat(a) to fit in b +struct ggml_tensor * ggml_repeat( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b); + +struct ggml_tensor * ggml_abs( + struct ggml_context * ctx, + struct ggml_tensor * a); + +struct ggml_tensor * ggml_sgn( + struct ggml_context * 
ctx, + struct ggml_tensor * a); + +struct ggml_tensor * ggml_neg( + struct ggml_context * ctx, + struct ggml_tensor * a); + +struct ggml_tensor * ggml_step( + struct ggml_context * ctx, + struct ggml_tensor * a); + +struct ggml_tensor * ggml_relu( + struct ggml_context * ctx, + struct ggml_tensor * a); + +// TODO: double-check this computation is correct +struct ggml_tensor * ggml_gelu( + struct ggml_context * ctx, + struct ggml_tensor * a); + +// normalize along rows +// TODO: eps is hardcoded to 1e-5 for now +struct ggml_tensor * ggml_norm( + struct ggml_context * ctx, + struct ggml_tensor * a); + +// A: m rows, n columns +// B: p rows, n columns (i.e. we transpose it internally) +// result is m columns, p rows +struct ggml_tensor * ggml_mul_mat( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b); + +// +// operations on tensors without backpropagation +// + +// in-place, returns view(a) +struct ggml_tensor * ggml_scale( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b); + +// a -> b, return view(b) +struct ggml_tensor * ggml_cpy( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b); + +// return view(a), b specifies the new shape +// TODO: when we start computing gradient, make a copy instead of view +struct ggml_tensor * ggml_reshape( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b); + +// return view(a) +// TODO: when we start computing gradient, make a copy instead of view +struct ggml_tensor * ggml_reshape_2d( + struct ggml_context * ctx, + struct ggml_tensor * a, + int ne0, + int ne1); + +// return view(a) +// TODO: when we start computing gradient, make a copy instead of view +struct ggml_tensor * ggml_reshape_3d( + struct ggml_context * ctx, + struct ggml_tensor * a, + int ne0, + int ne1, + int ne2); + +// offset in bytes +struct ggml_tensor * ggml_view_1d( + struct ggml_context * ctx, + struct ggml_tensor * a, + int ne0, + size_t offset); + +struct ggml_tensor * ggml_view_2d( + struct ggml_context * ctx, + struct ggml_tensor * a, + int ne0, + int ne1, + size_t nb1, // row stride in bytes + size_t offset); + +struct ggml_tensor * ggml_permute( + struct ggml_context * ctx, + struct ggml_tensor * a, + int axis0, + int axis1, + int axis2, + int axis3); + +// alias for ggml_permute(ctx, a, 1, 0, 2, 3) +struct ggml_tensor * ggml_transpose( + struct ggml_context * ctx, + struct ggml_tensor * a); + +struct ggml_tensor * ggml_get_rows( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b); + +// set elements above the diagonal to -INF +// in-place, returns view(a) +struct ggml_tensor * ggml_diag_mask_inf( + struct ggml_context * ctx, + struct ggml_tensor * a, + int n_past); + +// in-place, returns view(a) +struct ggml_tensor * ggml_soft_max( + struct ggml_context * ctx, + struct ggml_tensor * a); + +// rotary position embedding +// in-place, returns view(a) +// if mode == 1, skip n_past elements +// TODO: avoid creating a new tensor every time +struct ggml_tensor * ggml_rope( + struct ggml_context * ctx, + struct ggml_tensor * a, + int n_past, + int n_dims, + int mode); + +// padding = 1 +// TODO: we don't support extra parameters for now +// that's why we are hard-coding the stride, padding, and dilation +// not great .. 
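+// note: the _1s / _2s suffixes appear to encode the hard-coded stride (1 and 2, respectively);
+// for example, whisper.cpp uses ggml_conv_1d_1s for the first encoder convolution and
+// ggml_conv_1d_2s for the second, stride-2 convolution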
+struct ggml_tensor * ggml_conv_1d_1s( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b); + +struct ggml_tensor * ggml_conv_1d_2s( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b); + +struct ggml_tensor * ggml_flash_attn( + struct ggml_context * ctx, + struct ggml_tensor * q, + struct ggml_tensor * k, + struct ggml_tensor * v, + bool masked); + +struct ggml_tensor * ggml_flash_ff( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b0, + struct ggml_tensor * b1, + struct ggml_tensor * c0, + struct ggml_tensor * c1); + +// +// automatic differentiation +// + +void ggml_set_param( + struct ggml_context * ctx, + struct ggml_tensor * tensor); + +void ggml_build_forward_expand(struct ggml_cgraph * cgraph, struct ggml_tensor * tensor); + +struct ggml_cgraph ggml_build_forward (struct ggml_tensor * tensor); +struct ggml_cgraph ggml_build_backward(struct ggml_context * ctx, struct ggml_cgraph * gf, bool keep); + +void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph); +void ggml_graph_reset (struct ggml_cgraph * cgraph); + +// print info and performance information for the graph +void ggml_graph_print(const struct ggml_cgraph * cgraph); + +// dump the graph into a file using the dot format +void ggml_graph_dump_dot(const struct ggml_cgraph * gb, const struct ggml_cgraph * gf, const char * filename); + +// +// optimization +// + +// optimization methods +enum ggml_opt_type { + GGML_OPT_ADAM, + GGML_OPT_LBFGS, +}; + +// linesearch methods +enum ggml_linesearch { + GGML_LINESEARCH_DEFAULT = 1, + + GGML_LINESEARCH_BACKTRACKING_ARMIJO = 0, + GGML_LINESEARCH_BACKTRACKING_WOLFE = 1, + GGML_LINESEARCH_BACKTRACKING_STRONG_WOLFE = 2, +}; + +// optimization return values +enum ggml_opt_result { + GGML_OPT_OK = 0, + GGML_OPT_DID_NOT_CONVERGE, + GGML_OPT_NO_CONTEXT, + GGML_OPT_INVALID_WOLFE, + GGML_OPT_FAIL, + + GGML_LINESEARCH_FAIL = -128, + GGML_LINESEARCH_MINIMUM_STEP, + GGML_LINESEARCH_MAXIMUM_STEP, + GGML_LINESEARCH_MAXIMUM_ITERATIONS, + GGML_LINESEARCH_INVALID_PARAMETERS, +}; + +// optimization parameters +// +// see ggml.c (ggml_opt_default_params) for default values +// +struct ggml_opt_params { + enum ggml_opt_type type; + + int n_threads; + + // delta-based convergence test + // + // if past == 0 - disabled + // if past > 0: + // stop if |f(x) - f(x_past)| < delta * max(1, |f(x)|) + // + int past; + float delta; + + // maximum number of iterations without improvement + // + // if 0 - disabled + // if > 0: + // assume convergence if no cost improvement in this number of iterations + // + int max_no_improvement; + + bool print_forward_graph; + bool print_backward_graph; + + union { + // ADAM parameters + struct { + int n_iter; + + float alpha; // learning rate + float beta1; + float beta2; + float eps; // epsilon for numerical stability + float eps_f; // epsilon for convergence test + float eps_g; // epsilon for convergence test + } adam; + + // LBFGS parameters + struct { + int m; // number of corrections to approximate the inv. 
Hessian + int n_iter; + int max_linesearch; + + float eps; // convergence tolerance + float ftol; // line search tolerance + float wolfe; + float min_step; + float max_step; + + enum ggml_linesearch linesearch; + } lbfgs; + }; +}; + +struct ggml_opt_params ggml_opt_default_params(enum ggml_opt_type type); + +// optimize the function defined by the tensor f +enum ggml_opt_result ggml_opt( + struct ggml_context * ctx, + struct ggml_opt_params params, + struct ggml_tensor * f); + +// +// system info +// + +int ggml_cpu_has_avx(void); +int ggml_cpu_has_avx2(void); +int ggml_cpu_has_avx512(void); +int ggml_cpu_has_neon(void); +int ggml_cpu_has_f16c(void); +int ggml_cpu_has_fp16_va(void); +int ggml_cpu_has_wasm_simd(void); +int ggml_cpu_has_blas(void); + +#ifdef __cplusplus +} +#endif diff --git a/examples/whisper.android/app/src/main/jni/whisper/libwhisper/whisper.cpp b/examples/whisper.android/app/src/main/jni/whisper/libwhisper/whisper.cpp new file mode 100644 index 0000000..1bc7996 --- /dev/null +++ b/examples/whisper.android/app/src/main/jni/whisper/libwhisper/whisper.cpp @@ -0,0 +1,3374 @@ +#define WHISPER_BUILD +#include "whisper.h" + +#include "ggml.h" + +#include +#include +#define _USE_MATH_DEFINES +#include +#include +#include +#include +#include +#include +#include +#include + +#define USE_FLASH_ATTN +//#define USE_FLASH_FF + +// available whisper models +enum e_model { + MODEL_UNKNOWN, + MODEL_TINY, + MODEL_BASE, + MODEL_SMALL, + MODEL_MEDIUM, + MODEL_LARGE, +}; + +static const std::map> g_lang = { + { "en", { 0, "english", } }, + { "zh", { 1, "chinese", } }, + { "de", { 2, "german", } }, + { "es", { 3, "spanish", } }, + { "ru", { 4, "russian", } }, + { "ko", { 5, "korean", } }, + { "fr", { 6, "french", } }, + { "ja", { 7, "japanese", } }, + { "pt", { 8, "portuguese", } }, + { "tr", { 9, "turkish", } }, + { "pl", { 10, "polish", } }, + { "ca", { 11, "catalan", } }, + { "nl", { 12, "dutch", } }, + { "ar", { 13, "arabic", } }, + { "sv", { 14, "swedish", } }, + { "it", { 15, "italian", } }, + { "id", { 16, "indonesian", } }, + { "hi", { 17, "hindi", } }, + { "fi", { 18, "finnish", } }, + { "vi", { 19, "vietnamese", } }, + { "iw", { 20, "hebrew", } }, + { "uk", { 21, "ukrainian", } }, + { "el", { 22, "greek", } }, + { "ms", { 23, "malay", } }, + { "cs", { 24, "czech", } }, + { "ro", { 25, "romanian", } }, + { "da", { 26, "danish", } }, + { "hu", { 27, "hungarian", } }, + { "ta", { 28, "tamil", } }, + { "no", { 29, "norwegian", } }, + { "th", { 30, "thai", } }, + { "ur", { 31, "urdu", } }, + { "hr", { 32, "croatian", } }, + { "bg", { 33, "bulgarian", } }, + { "lt", { 34, "lithuanian", } }, + { "la", { 35, "latin", } }, + { "mi", { 36, "maori", } }, + { "ml", { 37, "malayalam", } }, + { "cy", { 38, "welsh", } }, + { "sk", { 39, "slovak", } }, + { "te", { 40, "telugu", } }, + { "fa", { 41, "persian", } }, + { "lv", { 42, "latvian", } }, + { "bn", { 43, "bengali", } }, + { "sr", { 44, "serbian", } }, + { "az", { 45, "azerbaijani", } }, + { "sl", { 46, "slovenian", } }, + { "kn", { 47, "kannada", } }, + { "et", { 48, "estonian", } }, + { "mk", { 49, "macedonian", } }, + { "br", { 50, "breton", } }, + { "eu", { 51, "basque", } }, + { "is", { 52, "icelandic", } }, + { "hy", { 53, "armenian", } }, + { "ne", { 54, "nepali", } }, + { "mn", { 55, "mongolian", } }, + { "bs", { 56, "bosnian", } }, + { "kk", { 57, "kazakh", } }, + { "sq", { 58, "albanian", } }, + { "sw", { 59, "swahili", } }, + { "gl", { 60, "galician", } }, + { "mr", { 61, "marathi", } }, + { "pa", { 62, "punjabi", } }, + { "si", { 
63, "sinhala", } }, + { "km", { 64, "khmer", } }, + { "sn", { 65, "shona", } }, + { "yo", { 66, "yoruba", } }, + { "so", { 67, "somali", } }, + { "af", { 68, "afrikaans", } }, + { "oc", { 69, "occitan", } }, + { "ka", { 70, "georgian", } }, + { "be", { 71, "belarusian", } }, + { "tg", { 72, "tajik", } }, + { "sd", { 73, "sindhi", } }, + { "gu", { 74, "gujarati", } }, + { "am", { 75, "amharic", } }, + { "yi", { 76, "yiddish", } }, + { "lo", { 77, "lao", } }, + { "uz", { 78, "uzbek", } }, + { "fo", { 79, "faroese", } }, + { "ht", { 80, "haitian creole", } }, + { "ps", { 81, "pashto", } }, + { "tk", { 82, "turkmen", } }, + { "nn", { 83, "nynorsk", } }, + { "mt", { 84, "maltese", } }, + { "sa", { 85, "sanskrit", } }, + { "lb", { 86, "luxembourgish", } }, + { "my", { 87, "myanmar", } }, + { "bo", { 88, "tibetan", } }, + { "tl", { 89, "tagalog", } }, + { "mg", { 90, "malagasy", } }, + { "as", { 91, "assamese", } }, + { "tt", { 92, "tatar", } }, + { "haw", { 93, "hawaiian", } }, + { "ln", { 94, "lingala", } }, + { "ha", { 95, "hausa", } }, + { "ba", { 96, "bashkir", } }, + { "jw", { 97, "javanese", } }, + { "su", { 98, "sundanese", } }, +}; + +static const size_t MB = 1024*1024; + +static const std::map MEM_REQ_MODEL = { + { MODEL_TINY, 74ull*MB }, + { MODEL_BASE, 142ull*MB }, + { MODEL_SMALL, 466ull*MB }, + { MODEL_MEDIUM, 1464ull*MB }, + { MODEL_LARGE, 2952ull*MB }, +}; + +static const std::map MEM_REQ_MEMORY = { + { MODEL_TINY, 12ull*MB }, + { MODEL_BASE, 24ull*MB }, + { MODEL_SMALL, 70ull*MB }, + { MODEL_MEDIUM, 184ull*MB }, + { MODEL_LARGE, 306ull*MB }, +}; + +static const std::map MEM_REQ_ENCODE = { + { MODEL_TINY, 80ull*MB }, + { MODEL_BASE, 128ull*MB }, + { MODEL_SMALL, 300ull*MB }, + { MODEL_MEDIUM, 680ull*MB }, + { MODEL_LARGE, 1100ull*MB }, +}; + +static const std::map MEM_REQ_ENCODE_LAYER = { + { MODEL_TINY, 104ull*MB }, + { MODEL_BASE, 138ull*MB }, + { MODEL_SMALL, 208ull*MB }, + { MODEL_MEDIUM, 280ull*MB }, + { MODEL_LARGE, 354ull*MB }, +}; + +static const std::map MEM_REQ_DECODE = { + { MODEL_TINY, 200ull*MB }, + { MODEL_BASE, 202ull*MB }, + { MODEL_SMALL, 204ull*MB }, + { MODEL_MEDIUM, 206ull*MB }, + { MODEL_LARGE, 208ull*MB }, +}; + +static const std::map MEM_REQ_DECODE_LAYER = { + { MODEL_TINY, 32ull*MB }, + { MODEL_BASE, 44ull*MB }, + { MODEL_SMALL, 64ull*MB }, + { MODEL_MEDIUM, 84ull*MB }, + { MODEL_LARGE, 110ull*MB }, +}; + +struct whisper_mel { + int n_len; + int n_mel; + + std::vector data; +}; + +struct whisper_filters { + int32_t n_mel; + int32_t n_fft; + + std::vector data; +}; + +struct whisper_vocab { + using id = int32_t; + using token = std::string; + + int n_vocab = 51864; + + std::map token_to_id; + std::map id_to_token; + + id token_eot = 50256; + id token_sot = 50257; + id token_prev = 50360; + id token_solm = 50361; // ?? 
+ id token_not = 50362; // no timestamps + id token_beg = 50363; + + // available tasks + static const id token_translate = 50358; + static const id token_transcribe = 50359; + + bool is_multilingual() const { + return n_vocab == 51865; + } +}; + +struct whisper_segment { + int64_t t0; + int64_t t1; + + std::string text; + + std::vector tokens; +}; + +// medium +// hparams: { +// 'n_mels': 80, +// 'n_vocab': 51864, +// 'n_audio_ctx': 1500, +// 'n_audio_state': 1024, +// 'n_audio_head': 16, +// 'n_audio_layer': 24, +// 'n_text_ctx': 448, +// 'n_text_state': 1024, +// 'n_text_head': 16, +// 'n_text_layer': 24 +// } +// +// default hparams (Whisper tiny) +struct whisper_hparams { + int32_t n_vocab = 51864; + int32_t n_audio_ctx = 1500; + int32_t n_audio_state = 384; + int32_t n_audio_head = 6; + int32_t n_audio_layer = 4; + int32_t n_text_ctx = 448; + int32_t n_text_state = 384; + int32_t n_text_head = 6; + int32_t n_text_layer = 4; + int32_t n_mels = 80; + int32_t f16 = 1; +}; + +// audio encoding layer +struct whisper_layer_encoder { + // encoder.blocks.*.attn_ln + struct ggml_tensor * attn_ln_0_w; + struct ggml_tensor * attn_ln_0_b; + + // encoder.blocks.*.attn.out + struct ggml_tensor * attn_ln_1_w; + struct ggml_tensor * attn_ln_1_b; + + // encoder.blocks.*.attn.query + struct ggml_tensor * attn_q_w; + struct ggml_tensor * attn_q_b; + + // encoder.blocks.*.attn.key + struct ggml_tensor * attn_k_w; + + // encoder.blocks.*.attn.value + struct ggml_tensor * attn_v_w; + struct ggml_tensor * attn_v_b; + + // encoder.blocks.*.mlp_ln + struct ggml_tensor * mlp_ln_w; + struct ggml_tensor * mlp_ln_b; + + // encoder.blocks.*.mlp.0 + struct ggml_tensor * mlp_0_w; + struct ggml_tensor * mlp_0_b; + + // encoder.blocks.*.mlp.2 + struct ggml_tensor * mlp_1_w; + struct ggml_tensor * mlp_1_b; +}; + +// token decoding layer +struct whisper_layer_decoder { + // decoder.blocks.*.attn_ln + struct ggml_tensor * attn_ln_0_w; + struct ggml_tensor * attn_ln_0_b; + + // decoder.blocks.*.attn.out + struct ggml_tensor * attn_ln_1_w; + struct ggml_tensor * attn_ln_1_b; + + // decoder.blocks.*.attn.query + struct ggml_tensor * attn_q_w; + struct ggml_tensor * attn_q_b; + + // decoder.blocks.*.attn.key + struct ggml_tensor * attn_k_w; + + // decoder.blocks.*.attn.value + struct ggml_tensor * attn_v_w; + struct ggml_tensor * attn_v_b; + + // decoder.blocks.*.cross_attn_ln + struct ggml_tensor * cross_attn_ln_0_w; + struct ggml_tensor * cross_attn_ln_0_b; + + // decoder.blocks.*.cross_attn.out + struct ggml_tensor * cross_attn_ln_1_w; + struct ggml_tensor * cross_attn_ln_1_b; + + // decoder.blocks.*.cross_attn.query + struct ggml_tensor * cross_attn_q_w; + struct ggml_tensor * cross_attn_q_b; + + // decoder.blocks.*.cross_attn.key + struct ggml_tensor * cross_attn_k_w; + + // decoder.blocks.*.cross_attn.value + struct ggml_tensor * cross_attn_v_w; + struct ggml_tensor * cross_attn_v_b; + + // decoder.blocks.*.mlp_ln + struct ggml_tensor * mlp_ln_w; + struct ggml_tensor * mlp_ln_b; + + // decoder.blocks.*.mlp.0 + struct ggml_tensor * mlp_0_w; + struct ggml_tensor * mlp_0_b; + + // decoder.blocks.*.mlp.2 + struct ggml_tensor * mlp_1_w; + struct ggml_tensor * mlp_1_b; +}; + +struct whisper_model { + e_model type = MODEL_UNKNOWN; + + whisper_hparams hparams; + whisper_filters filters; + + // encoder.positional_embedding + struct ggml_tensor * e_pe; + + // encoder.conv1 + struct ggml_tensor * e_conv_1_w; + struct ggml_tensor * e_conv_1_b; + + // encoder.conv2 + struct ggml_tensor * e_conv_2_w; + struct ggml_tensor * 
e_conv_2_b; + + // encoder.ln_post + struct ggml_tensor * e_ln_w; + struct ggml_tensor * e_ln_b; + + // decoder.positional_embedding + struct ggml_tensor * d_pe; // DD + + // decoder.token_embedding + struct ggml_tensor * d_te; // DD + + // decoder.ln + struct ggml_tensor * d_ln_w; // DD + struct ggml_tensor * d_ln_b; // DD + + std::vector layers_encoder; + std::vector layers_decoder; + + // key + value memory + struct ggml_tensor * memory_k; + struct ggml_tensor * memory_v; + + struct ggml_tensor * memory_cross_k; + struct ggml_tensor * memory_cross_v; + + // context + struct ggml_context * ctx; + struct ggml_context * ctx_mem; + + // tensors + int n_loaded; + std::map tensors; +}; + +struct whisper_context { + int64_t t_load_us = 0; + int64_t t_mel_us = 0; + int64_t t_sample_us = 0; + int64_t t_encode_us = 0; + int64_t t_decode_us = 0; + int64_t t_start_us = 0; + + std::vector * buf_model; // the model buffer is read-only and can be shared between processors + std::vector buf_memory; + std::vector buf_compute; + std::vector buf_compute_layer; + + whisper_model model; + whisper_vocab vocab; + + whisper_mel mel; + + std::vector probs; + std::vector logits; + + std::vector result_all; + + std::vector prompt_past; + + // [EXPERIMENTAL] token-level timestamps data + int64_t t_beg; + int64_t t_last; + whisper_token tid_last; + std::vector energy; // PCM signal energy + + // [EXPERIMENTAL] speed-up techniques + int32_t exp_n_audio_ctx; // 0 - use default +}; + +template +static void read_safe(std::ifstream& fin, T& dest) +{ + fin.read((char*)& dest, sizeof(T)); +} + +// load the model from a ggml file +// +// file format: +// +// - hparams +// - pre-computed mel filters +// - vocab +// - weights +// +// see the convert-pt-to-ggml.py script for details +// +static bool whisper_model_load(const std::string & fname, whisper_context & wctx) { + fprintf(stderr, "%s: loading model from '%s'\n", __func__, fname.c_str()); + + auto & model = wctx.model; + auto & vocab = wctx.vocab; + + auto fin = std::ifstream(fname, std::ios::binary); + if (!fin) { + fprintf(stderr, "%s: failed to open '%s'\n", __func__, fname.c_str()); + return false; + } + + // verify magic + { + uint32_t magic; + read_safe(fin, magic); + if (magic != 0x67676d6c) { + fprintf(stderr, "%s: invalid model file '%s' (bad magic)\n", __func__, fname.c_str()); + return false; + } + } + + //load hparams + { + auto & hparams = model.hparams; + + read_safe(fin, hparams.n_vocab); + read_safe(fin, hparams.n_audio_ctx); + read_safe(fin, hparams.n_audio_state); + read_safe(fin, hparams.n_audio_head); + read_safe(fin, hparams.n_audio_layer); + read_safe(fin, hparams.n_text_ctx); + read_safe(fin, hparams.n_text_state); + read_safe(fin, hparams.n_text_head); + read_safe(fin, hparams.n_text_layer); + read_safe(fin, hparams.n_mels); + read_safe(fin, hparams.f16); + + assert(hparams.n_text_state == hparams.n_audio_state); + + if (hparams.n_audio_layer == 4) { + model.type = e_model::MODEL_TINY; + } + + if (hparams.n_audio_layer == 6) { + model.type = e_model::MODEL_BASE; + } + + if (hparams.n_audio_layer == 12) { + model.type = e_model::MODEL_SMALL; + } + + if (hparams.n_audio_layer == 24) { + model.type = e_model::MODEL_MEDIUM; + } + + if (hparams.n_audio_layer == 32) { + model.type = e_model::MODEL_LARGE; + } + + fprintf(stderr, "%s: n_vocab = %d\n", __func__, hparams.n_vocab); + fprintf(stderr, "%s: n_audio_ctx = %d\n", __func__, hparams.n_audio_ctx); + fprintf(stderr, "%s: n_audio_state = %d\n", __func__, hparams.n_audio_state); + fprintf(stderr, 
"%s: n_audio_head = %d\n", __func__, hparams.n_audio_head); + fprintf(stderr, "%s: n_audio_layer = %d\n", __func__, hparams.n_audio_layer); + fprintf(stderr, "%s: n_text_ctx = %d\n", __func__, hparams.n_text_ctx); + fprintf(stderr, "%s: n_text_state = %d\n", __func__, hparams.n_text_state); + fprintf(stderr, "%s: n_text_head = %d\n", __func__, hparams.n_text_head); + fprintf(stderr, "%s: n_text_layer = %d\n", __func__, hparams.n_text_layer); + fprintf(stderr, "%s: n_mels = %d\n", __func__, hparams.n_mels); + fprintf(stderr, "%s: f16 = %d\n", __func__, hparams.f16); + fprintf(stderr, "%s: type = %d\n", __func__, model.type); + + wctx.buf_model = new std::vector(); + wctx.buf_model->resize(MEM_REQ_MODEL.at(model.type)); + wctx.buf_memory.resize(MEM_REQ_MEMORY.at(model.type)); + wctx.buf_compute.resize(std::max(MEM_REQ_ENCODE.at(model.type), MEM_REQ_DECODE.at(model.type))); + wctx.buf_compute_layer.resize(std::max(MEM_REQ_ENCODE_LAYER.at(model.type), MEM_REQ_DECODE_LAYER.at(model.type))); + } + + // load mel filters + { + auto & filters = wctx.model.filters; + + read_safe(fin, filters.n_mel); + read_safe(fin, filters.n_fft); + + filters.data.resize(filters.n_mel * filters.n_fft); + fin.read((char *) filters.data.data(), filters.data.size() * sizeof(float)); + } + + // load vocab + { + int32_t n_vocab = 0; + read_safe(fin, n_vocab); + + //if (n_vocab != model.hparams.n_vocab) { + // fprintf(stderr, "%s: invalid model file '%s' (bad vocab size %d != %d)\n", + // __func__, fname.c_str(), n_vocab, model.hparams.n_vocab); + // return false; + //} + + std::string word; + std::vector tmp; + for (int i = 0; i < n_vocab; i++) { + uint32_t len; + read_safe(fin, len); + + if (len > 0) { + tmp.resize(len); + fin.read(&tmp[0], tmp.size()); // read to buffer + word.assign(&tmp[0], tmp.size()); + } else { + // seems like we have an empty-string token in multi-language models (i = 50256) + //fprintf(stderr, "%s: warning: empty-string token in vocab, i = %d\n", __func__, i); + word = ""; + } + + vocab.token_to_id[word] = i; + vocab.id_to_token[i] = word; + + //printf("%s: vocab[%d] = '%s'\n", __func__, i, word.c_str()); + } + + vocab.n_vocab = model.hparams.n_vocab; + if (vocab.is_multilingual()) { + vocab.token_eot++; + vocab.token_sot++; + vocab.token_prev++; + vocab.token_solm++; + vocab.token_not++; + vocab.token_beg++; + } + + if (n_vocab < model.hparams.n_vocab) { + fprintf(stderr, "%s: adding %d extra tokens\n", __func__, model.hparams.n_vocab - n_vocab); + for (int i = n_vocab; i < model.hparams.n_vocab; i++) { + if (i > vocab.token_beg) { + word = "[_TT_" + std::to_string(i - vocab.token_beg) + "]"; + } else if (i == vocab.token_eot) { + word = "[_EOT_]"; + } else if (i == vocab.token_sot) { + word = "[_SOT_]"; + } else if (i == vocab.token_prev) { + word = "[_PREV_]"; + } else if (i == vocab.token_not) { + word = "[_NOT_]"; + } else if (i == vocab.token_beg) { + word = "[_BEG_]"; + } else { + word = "[_extra_token_" + std::to_string(i) + "]"; + } + vocab.token_to_id[word] = i; + vocab.id_to_token[i] = word; + } + } + } + + { + // this is the total memory required to run the inference + const size_t mem_required = + wctx.buf_model->size() + + wctx.buf_memory.size() + + wctx.buf_compute.size() + + wctx.buf_compute_layer.size(); + + fprintf(stderr, "%s: mem_required = %7.2f MB\n", __func__, mem_required / 1024.0 / 1024.0); + } + + // for the big tensors, we have the option to store the data in 16-bit floats + // in order to save memory and also to speed up the computation + const ggml_type wtype = 
model.hparams.f16 ? GGML_TYPE_F16 : GGML_TYPE_F32; + + size_t ctx_size = 0; + size_t ctx_mem_size = 0; + + { + const auto & hparams = model.hparams; + + const int n_vocab = hparams.n_vocab; + + const int n_audio_ctx = hparams.n_audio_ctx; + const int n_audio_state = hparams.n_audio_state; + const int n_audio_layer = hparams.n_audio_layer; + + const int n_text_ctx = hparams.n_text_ctx; + const int n_text_state = hparams.n_text_state; + const int n_text_layer = hparams.n_text_layer; + + const int n_mels = hparams.n_mels; + + // encoder + { + // TODO: F16 .. maybe not? + ctx_size += n_audio_ctx*n_audio_state*ggml_type_size(GGML_TYPE_F32); // e_pe; + + ctx_size += 3*n_mels*n_audio_state*ggml_type_size(wtype); // e_conv_1_w + ctx_size += n_audio_state*ggml_type_size(GGML_TYPE_F32); // e_conv_1_b + + ctx_size += 3*n_audio_state*n_audio_state*ggml_type_size(wtype); // e_conv_2_w + ctx_size += n_audio_state*ggml_type_size(GGML_TYPE_F32); // e_conv_2_b + + ctx_size += n_audio_state*ggml_type_size(GGML_TYPE_F32); // e_ln_w; + ctx_size += n_audio_state*ggml_type_size(GGML_TYPE_F32); // e_ln_b; + } + + // decoder + { + // TODO: F16 .. maybe not? + ctx_size += n_text_ctx*n_text_state*ggml_type_size(GGML_TYPE_F32); // d_pe; + + ctx_size += n_vocab*n_text_state*ggml_type_size(wtype); // d_te; + + ctx_size += n_text_state*ggml_type_size(GGML_TYPE_F32); // d_ln_w; + ctx_size += n_text_state*ggml_type_size(GGML_TYPE_F32); // d_ln_b; + } + + // encoder layers + { + ctx_size += n_audio_layer*(n_audio_state*ggml_type_size(GGML_TYPE_F32)); // mlp_ln_w + ctx_size += n_audio_layer*(n_audio_state*ggml_type_size(GGML_TYPE_F32)); // mlp_ln_b + + ctx_size += n_audio_layer*(4*n_audio_state*n_audio_state*ggml_type_size(wtype)); // mlp_0_w + ctx_size += n_audio_layer*( 4*n_audio_state*ggml_type_size(GGML_TYPE_F32)); // mlp_0_b + + ctx_size += n_audio_layer*(4*n_audio_state*n_audio_state*ggml_type_size(wtype)); // mlp_1_w + ctx_size += n_audio_layer*( n_audio_state*ggml_type_size(GGML_TYPE_F32)); // mlp_1_b + + ctx_size += n_audio_layer*(n_audio_state*ggml_type_size(GGML_TYPE_F32)); // attn_ln_0_w + ctx_size += n_audio_layer*(n_audio_state*ggml_type_size(GGML_TYPE_F32)); // attn_ln_0_b + + ctx_size += n_audio_layer*(n_audio_state*n_audio_state*ggml_type_size(wtype)); // attn_q_w + ctx_size += n_audio_layer*( n_audio_state*ggml_type_size(GGML_TYPE_F32)); // attn_q_b + + ctx_size += n_audio_layer*(n_audio_state*n_audio_state*ggml_type_size(wtype)); // attn_k_w + + ctx_size += n_audio_layer*(n_audio_state*n_audio_state*ggml_type_size(wtype)); // attn_v_w + ctx_size += n_audio_layer*( n_audio_state*ggml_type_size(GGML_TYPE_F32)); // attn_v_b + + ctx_size += n_audio_layer*(n_audio_state*n_audio_state*ggml_type_size(wtype)); // attn_ln_1_w + ctx_size += n_audio_layer*( n_audio_state*ggml_type_size(GGML_TYPE_F32)); // attn_ln_1_b + } + + // decoder layers + { + ctx_size += n_text_layer*(n_text_state*ggml_type_size(GGML_TYPE_F32)); // mlp_ln_w + ctx_size += n_text_layer*(n_text_state*ggml_type_size(GGML_TYPE_F32)); // mlp_ln_b + + ctx_size += n_text_layer*(4*n_text_state*n_text_state*ggml_type_size(wtype)); // mlp_0_w + ctx_size += n_text_layer*( 4*n_text_state*ggml_type_size(GGML_TYPE_F32)); // mlp_0_b + + ctx_size += n_text_layer*(4*n_text_state*n_text_state*ggml_type_size(wtype)); // mlp_1_w + ctx_size += n_text_layer*( n_text_state*ggml_type_size(GGML_TYPE_F32)); // mlp_1_b + + ctx_size += n_text_layer*(n_text_state*ggml_type_size(GGML_TYPE_F32)); // attn_ln_0_w + ctx_size += 
n_text_layer*(n_text_state*ggml_type_size(GGML_TYPE_F32)); // attn_ln_0_b + + ctx_size += n_text_layer*(n_text_state*n_text_state*ggml_type_size(wtype)); // attn_q_w + ctx_size += n_text_layer*( n_text_state*ggml_type_size(GGML_TYPE_F32)); // attn_q_b + + ctx_size += n_text_layer*(n_text_state*n_text_state*ggml_type_size(wtype)); // attn_k_w + + ctx_size += n_text_layer*(n_text_state*n_text_state*ggml_type_size(wtype)); // attn_v_w + ctx_size += n_text_layer*( n_text_state*ggml_type_size(GGML_TYPE_F32)); // attn_v_b + + ctx_size += n_text_layer*(n_text_state*n_text_state*ggml_type_size(wtype)); // attn_ln_1_w + ctx_size += n_text_layer*( n_text_state*ggml_type_size(GGML_TYPE_F32)); // attn_ln_1_b + // + ctx_size += n_text_layer*(n_text_state*ggml_type_size(GGML_TYPE_F32)); // cross_attn_ln_0_w + ctx_size += n_text_layer*(n_text_state*ggml_type_size(GGML_TYPE_F32)); // cross_attn_ln_0_b + + ctx_size += n_text_layer*(n_text_state*n_text_state*ggml_type_size(wtype)); // cross_attn_q_w + ctx_size += n_text_layer*( n_text_state*ggml_type_size(GGML_TYPE_F32)); // cross_attn_q_b + + ctx_size += n_text_layer*(n_text_state*n_text_state*ggml_type_size(wtype)); // cross_attn_k_w + + ctx_size += n_text_layer*(n_text_state*n_text_state*ggml_type_size(wtype)); // cross_attn_v_w + ctx_size += n_text_layer*( n_text_state*ggml_type_size(GGML_TYPE_F32)); // cross_attn_v_b + + ctx_size += n_text_layer*(n_text_state*n_text_state*ggml_type_size(wtype)); // cross_attn_ln_1_w + ctx_size += n_text_layer*( n_text_state*ggml_type_size(GGML_TYPE_F32)); // cross_attn_ln_1_b + } + + ctx_mem_size += n_text_layer*n_text_ctx*n_text_state*ggml_type_size(GGML_TYPE_F16); // memory_k + ctx_mem_size += n_text_layer*n_text_ctx*n_text_state*ggml_type_size(GGML_TYPE_F16); // memory_v + + ctx_mem_size += n_text_layer*n_audio_ctx*n_text_state*ggml_type_size(GGML_TYPE_F16); // memory_cross_k + ctx_mem_size += n_text_layer*n_audio_ctx*n_text_state*ggml_type_size(GGML_TYPE_F16); // memory_cross_v + + ctx_size += (15 + 15*n_audio_layer + 24*n_text_layer)*256; // object overhead + + fprintf(stderr, "%s: ggml ctx size = %7.2f MB\n", __func__, ctx_size/(1024.0*1024.0)); + } + + // create the ggml context + { + struct ggml_init_params params; + params.mem_size = wctx.buf_model->size(); + params.mem_buffer = wctx.buf_model->data(); + + model.ctx = ggml_init(params); + if (!model.ctx) { + fprintf(stderr, "%s: ggml_init() failed\n", __func__); + return false; + } + } + + // prepare memory for the weights + { + auto & ctx = model.ctx; + + const auto & hparams = model.hparams; + + const int n_vocab = hparams.n_vocab; + + const int n_audio_ctx = hparams.n_audio_ctx; + const int n_audio_state = hparams.n_audio_state; + const int n_audio_layer = hparams.n_audio_layer; + + const int n_text_ctx = hparams.n_text_ctx; + const int n_text_state = hparams.n_text_state; + const int n_text_layer = hparams.n_text_layer; + + const int n_mels = hparams.n_mels; + + model.layers_encoder.resize(n_audio_layer); + model.layers_decoder.resize(n_text_layer); + + // encoder + { + model.e_pe = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_audio_state, n_audio_ctx); + + model.e_conv_1_w = ggml_new_tensor_3d(ctx, wtype, 3, n_mels, n_audio_state); + model.e_conv_1_b = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 1, n_audio_state); + + model.e_conv_2_w = ggml_new_tensor_3d(ctx, wtype, 3, n_audio_state, n_audio_state); + model.e_conv_2_b = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 1, n_audio_state); + + model.e_ln_w = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_audio_state); + 
model.e_ln_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_audio_state); + + // map by name + model.tensors["encoder.positional_embedding"] = model.e_pe; + + model.tensors["encoder.conv1.weight"] = model.e_conv_1_w; + model.tensors["encoder.conv1.bias"] = model.e_conv_1_b; + + model.tensors["encoder.conv2.weight"] = model.e_conv_2_w; + model.tensors["encoder.conv2.bias"] = model.e_conv_2_b; + + model.tensors["encoder.ln_post.weight"] = model.e_ln_w; + model.tensors["encoder.ln_post.bias"] = model.e_ln_b; + + for (int i = 0; i < n_audio_layer; ++i) { + auto & layer = model.layers_encoder[i]; + + layer.mlp_ln_w = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_audio_state); + layer.mlp_ln_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_audio_state); + + layer.mlp_0_w = ggml_new_tensor_2d(ctx, wtype, n_audio_state, 4*n_audio_state); + layer.mlp_0_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 4*n_audio_state); + + layer.mlp_1_w = ggml_new_tensor_2d(ctx, wtype, 4*n_audio_state, n_audio_state); + layer.mlp_1_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_audio_state); + + layer.attn_ln_0_w = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_audio_state); + layer.attn_ln_0_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_audio_state); + + layer.attn_q_w = ggml_new_tensor_2d(ctx, wtype, n_audio_state, n_audio_state); + layer.attn_q_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_audio_state); + + layer.attn_k_w = ggml_new_tensor_2d(ctx, wtype, n_audio_state, n_audio_state); + + layer.attn_v_w = ggml_new_tensor_2d(ctx, wtype, n_audio_state, n_audio_state); + layer.attn_v_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_audio_state); + + layer.attn_ln_1_w = ggml_new_tensor_2d(ctx, wtype, n_audio_state, n_audio_state); + layer.attn_ln_1_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_audio_state); + + // map by name + model.tensors["encoder.blocks." + std::to_string(i) + ".mlp_ln.weight"] = layer.mlp_ln_w; + model.tensors["encoder.blocks." + std::to_string(i) + ".mlp_ln.bias"] = layer.mlp_ln_b; + + model.tensors["encoder.blocks." + std::to_string(i) + ".mlp.0.weight"] = layer.mlp_0_w; + model.tensors["encoder.blocks." + std::to_string(i) + ".mlp.0.bias"] = layer.mlp_0_b; + + model.tensors["encoder.blocks." + std::to_string(i) + ".mlp.2.weight"] = layer.mlp_1_w; + model.tensors["encoder.blocks." + std::to_string(i) + ".mlp.2.bias"] = layer.mlp_1_b; + + model.tensors["encoder.blocks." + std::to_string(i) + ".attn_ln.weight"] = layer.attn_ln_0_w; + model.tensors["encoder.blocks." + std::to_string(i) + ".attn_ln.bias"] = layer.attn_ln_0_b; + + model.tensors["encoder.blocks." + std::to_string(i) + ".attn.query.weight"] = layer.attn_q_w; + model.tensors["encoder.blocks." + std::to_string(i) + ".attn.query.bias"] = layer.attn_q_b; + + model.tensors["encoder.blocks." + std::to_string(i) + ".attn.key.weight"] = layer.attn_k_w; + + model.tensors["encoder.blocks." + std::to_string(i) + ".attn.value.weight"] = layer.attn_v_w; + model.tensors["encoder.blocks." + std::to_string(i) + ".attn.value.bias"] = layer.attn_v_b; + + model.tensors["encoder.blocks." + std::to_string(i) + ".attn.out.weight"] = layer.attn_ln_1_w; + model.tensors["encoder.blocks." 
+ std::to_string(i) + ".attn.out.bias"] = layer.attn_ln_1_b; + } + } + + // decoder + { + model.d_pe = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_text_state, n_text_ctx); + + model.d_te = ggml_new_tensor_2d(ctx, wtype, n_text_state, n_vocab); + + model.d_ln_w = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_text_state); + model.d_ln_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_text_state); + + // map by name + model.tensors["decoder.positional_embedding"] = model.d_pe; + + model.tensors["decoder.token_embedding.weight"] = model.d_te; + + model.tensors["decoder.ln.weight"] = model.d_ln_w; + model.tensors["decoder.ln.bias"] = model.d_ln_b; + + for (int i = 0; i < n_text_layer; ++i) { + auto & layer = model.layers_decoder[i]; + + layer.mlp_ln_w = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_text_state); + layer.mlp_ln_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_text_state); + + layer.mlp_0_w = ggml_new_tensor_2d(ctx, wtype, n_text_state, 4*n_text_state); + layer.mlp_0_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 4*n_text_state); + + layer.mlp_1_w = ggml_new_tensor_2d(ctx, wtype, 4*n_text_state, n_text_state); + layer.mlp_1_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_text_state); + + layer.attn_ln_0_w = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_text_state); + layer.attn_ln_0_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_text_state); + + layer.attn_q_w = ggml_new_tensor_2d(ctx, wtype, n_text_state, n_text_state); + layer.attn_q_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_text_state); + + layer.attn_k_w = ggml_new_tensor_2d(ctx, wtype, n_text_state, n_text_state); + + layer.attn_v_w = ggml_new_tensor_2d(ctx, wtype, n_text_state, n_text_state); + layer.attn_v_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_text_state); + + layer.attn_ln_1_w = ggml_new_tensor_2d(ctx, wtype, n_text_state, n_text_state); + layer.attn_ln_1_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_text_state); + + layer.cross_attn_ln_0_w = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_text_state); + layer.cross_attn_ln_0_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_text_state); + + layer.cross_attn_q_w = ggml_new_tensor_2d(ctx, wtype, n_text_state, n_text_state); + layer.cross_attn_q_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_text_state); + + layer.cross_attn_k_w = ggml_new_tensor_2d(ctx, wtype, n_text_state, n_text_state); + + layer.cross_attn_v_w = ggml_new_tensor_2d(ctx, wtype, n_text_state, n_text_state); + layer.cross_attn_v_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_text_state); + + layer.cross_attn_ln_1_w = ggml_new_tensor_2d(ctx, wtype, n_text_state, n_text_state); + layer.cross_attn_ln_1_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_text_state); + + // map by name + model.tensors["decoder.blocks." + std::to_string(i) + ".mlp_ln.weight"] = layer.mlp_ln_w; + model.tensors["decoder.blocks." + std::to_string(i) + ".mlp_ln.bias"] = layer.mlp_ln_b; + + model.tensors["decoder.blocks." + std::to_string(i) + ".mlp.0.weight"] = layer.mlp_0_w; + model.tensors["decoder.blocks." + std::to_string(i) + ".mlp.0.bias"] = layer.mlp_0_b; + + model.tensors["decoder.blocks." + std::to_string(i) + ".mlp.2.weight"] = layer.mlp_1_w; + model.tensors["decoder.blocks." + std::to_string(i) + ".mlp.2.bias"] = layer.mlp_1_b; + + model.tensors["decoder.blocks." + std::to_string(i) + ".attn_ln.weight"] = layer.attn_ln_0_w; + model.tensors["decoder.blocks." + std::to_string(i) + ".attn_ln.bias"] = layer.attn_ln_0_b; + + model.tensors["decoder.blocks." + std::to_string(i) + ".attn.query.weight"] = layer.attn_q_w; + model.tensors["decoder.blocks." 
+ std::to_string(i) + ".attn.query.bias"] = layer.attn_q_b; + + model.tensors["decoder.blocks." + std::to_string(i) + ".attn.key.weight"] = layer.attn_k_w; + + model.tensors["decoder.blocks." + std::to_string(i) + ".attn.value.weight"] = layer.attn_v_w; + model.tensors["decoder.blocks." + std::to_string(i) + ".attn.value.bias"] = layer.attn_v_b; + + model.tensors["decoder.blocks." + std::to_string(i) + ".attn.out.weight"] = layer.attn_ln_1_w; + model.tensors["decoder.blocks." + std::to_string(i) + ".attn.out.bias"] = layer.attn_ln_1_b; + + model.tensors["decoder.blocks." + std::to_string(i) + ".cross_attn_ln.weight"] = layer.cross_attn_ln_0_w; + model.tensors["decoder.blocks." + std::to_string(i) + ".cross_attn_ln.bias"] = layer.cross_attn_ln_0_b; + + model.tensors["decoder.blocks." + std::to_string(i) + ".cross_attn.query.weight"] = layer.cross_attn_q_w; + model.tensors["decoder.blocks." + std::to_string(i) + ".cross_attn.query.bias"] = layer.cross_attn_q_b; + + model.tensors["decoder.blocks." + std::to_string(i) + ".cross_attn.key.weight"] = layer.cross_attn_k_w; + + model.tensors["decoder.blocks." + std::to_string(i) + ".cross_attn.value.weight"] = layer.cross_attn_v_w; + model.tensors["decoder.blocks." + std::to_string(i) + ".cross_attn.value.bias"] = layer.cross_attn_v_b; + + model.tensors["decoder.blocks." + std::to_string(i) + ".cross_attn.out.weight"] = layer.cross_attn_ln_1_w; + model.tensors["decoder.blocks." + std::to_string(i) + ".cross_attn.out.bias"] = layer.cross_attn_ln_1_b; + } + } + } + + // create the ggml memory context + { + struct ggml_init_params params; + params.mem_size = wctx.buf_memory.size(); + params.mem_buffer = wctx.buf_memory.data(); + + model.ctx_mem = ggml_init(params); + if (!model.ctx_mem) { + fprintf(stderr, "%s: ggml_init() failed\n", __func__); + return false; + } + } + + // key + value memory + { + auto & ctx = model.ctx_mem; + + const auto & hparams = model.hparams; + + const int n_text_state = hparams.n_text_state; + const int n_text_layer = hparams.n_text_layer; + const int n_text_ctx = hparams.n_text_ctx; + + // key/value memory for the self-attention layer + { + const int n_mem = n_text_layer*n_text_ctx; + const int n_elements = n_text_state*n_mem; + + model.memory_k = ggml_new_tensor_1d(ctx, GGML_TYPE_F16, n_elements); + model.memory_v = ggml_new_tensor_1d(ctx, GGML_TYPE_F16, n_elements); + } + + // key/value memory for the cross-attention layer + { + const int n_audio_ctx = hparams.n_audio_ctx; + + const int n_mem = n_text_layer*n_audio_ctx; + const int n_elements = n_text_state*n_mem; + + model.memory_cross_k = ggml_new_tensor_1d(ctx, GGML_TYPE_F16, n_elements); + model.memory_cross_v = ggml_new_tensor_1d(ctx, GGML_TYPE_F16, n_elements); + } + + const size_t memory_size = + ggml_nbytes(model.memory_k) + ggml_nbytes(model.memory_v) + + ggml_nbytes(model.memory_cross_k) + ggml_nbytes(model.memory_cross_v); + + fprintf(stderr, "%s: memory size = %7.2f MB\n", __func__, memory_size/1024.0/1024.0); + } + + // load weights + { + size_t total_size = 0; + + model.n_loaded = 0; + + while (true) { + int32_t n_dims; + int32_t length; + int32_t ftype; + + read_safe(fin, n_dims); + read_safe(fin, length); + read_safe(fin, ftype); + + if (fin.eof()) { + break; + } + + int32_t nelements = 1; + int32_t ne[3] = { 1, 1, 1 }; + for (int i = 0; i < n_dims; ++i) { + read_safe(fin, ne[i]); + nelements *= ne[i]; + } + + std::string name; + std::vector tmp(length); // create a buffer + fin.read( &tmp[0], tmp.size() ); // read to buffer + name.assign(&tmp[0], 
tmp.size()); + + if (model.tensors.find(name.data()) == model.tensors.end()) { + fprintf(stderr, "%s: unknown tensor '%s' in model file\n", __func__, name.data()); + return false; + } + + auto tensor = model.tensors[name.data()]; + if (ggml_nelements(tensor) != nelements) { + fprintf(stderr, "%s: tensor '%s' has wrong size in model file\n", __func__, name.data()); + return false; + } + + if (tensor->ne[0] != ne[0] || tensor->ne[1] != ne[1] || tensor->ne[2] != ne[2]) { + fprintf(stderr, "%s: tensor '%s' has wrong shape in model file: got [%d, %d, %d], expected [%d, %d, %d]\n", + __func__, name.data(), tensor->ne[0], tensor->ne[1], tensor->ne[2], ne[0], ne[1], ne[2]); + return false; + } + + const size_t bpe = (ftype == 0) ? sizeof(float) : sizeof(ggml_fp16_t); + + if (nelements*bpe != ggml_nbytes(tensor)) { + fprintf(stderr, "%s: tensor '%s' has wrong size in model file: got %zu, expected %zu\n", + __func__, name.data(), ggml_nbytes(tensor), nelements*bpe); + return false; + } + + fin.read(reinterpret_cast(tensor->data), ggml_nbytes(tensor)); + + //printf("%48s - [%5d, %5d, %5d], type = %6s, %6.2f MB\n", name.data(), ne[0], ne[1], ne[2], ftype == 0 ? "float" : "f16", ggml_nbytes(tensor)/1024.0/1024.0); + total_size += ggml_nbytes(tensor); + model.n_loaded++; + } + + fprintf(stderr, "%s: model size = %7.2f MB\n", __func__, total_size/1024.0/1024.0); + + if (model.n_loaded == 0) { + fprintf(stderr, "%s: WARN no tensors loaded from model file - assuming empty model for testing\n", __func__); + } else if (model.n_loaded != (int) model.tensors.size()) { + fprintf(stderr, "%s: ERROR not all tensors loaded from model file - expected %zu, got %d\n", __func__, model.tensors.size(), model.n_loaded); + return false; + } + } + + fin.close(); + + return true; +} + +// evaluate the encoder +// +// given audio recording (more specifically, its log mel spectrogram), runs forward pass of the encoder +// part of the transformer model and returns the encoded features +// +// - model: the model +// - n_threads: number of threads to use +// - mel_offset: offset in the mel spectrogram (i.e. audio offset) +// +static bool whisper_encode( + whisper_context & wctx, + const int n_threads, + const int mel_offset) { + const auto & model = wctx.model; + const auto & mel_inp = wctx.mel; + const auto & hparams = model.hparams; + + const int n_ctx = wctx.exp_n_audio_ctx > 0 ? 
wctx.exp_n_audio_ctx : hparams.n_audio_ctx; + const int n_state = hparams.n_audio_state; + const int n_head = hparams.n_audio_head; + const int n_layer = hparams.n_audio_layer; + + const int n_mels = hparams.n_mels; + assert(mel_inp.n_mel == n_mels); + + struct ggml_init_params params; + params.mem_size = wctx.buf_compute.size(); + params.mem_buffer = wctx.buf_compute.data(); + + struct ggml_context * ctx0 = ggml_init(params); + + struct ggml_tensor * mel = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, 2*n_ctx, n_mels); + assert(mel->type == GGML_TYPE_F32); + { + float * dst = (float *) mel->data; + memset(dst, 0, ggml_nbytes(mel)); + + const int i0 = std::min(mel_offset, mel_inp.n_len); + const int i1 = std::min(mel_offset + 2*n_ctx, mel_inp.n_len); + + for (int j = 0; j < mel_inp.n_mel; ++j) { + for (int i = i0; i < i1; ++i) { + dst[j*2*n_ctx + (i - i0)] = mel_inp.data[j*mel_inp.n_len + i]; + } + } + } + + struct ggml_tensor * cur; + + // convolution + gelu + { + cur = ggml_conv_1d_1s(ctx0, model.e_conv_1_w, mel); + cur = ggml_add(ctx0, + ggml_repeat(ctx0, + model.e_conv_1_b, + cur), + cur); + + cur = ggml_gelu(ctx0, cur); + + cur = ggml_conv_1d_2s(ctx0, model.e_conv_2_w, cur); + cur = ggml_add(ctx0, + ggml_repeat(ctx0, + model.e_conv_2_b, + cur), + cur); + + cur = ggml_gelu(ctx0, cur); + } + + // =================================================================== + // NOTE: experimenting with partial evaluation of the encoder (ignore) + //static int iter = -1; + //const int n_iter = 1500/n_ctx; + + //iter = (iter + 1) % n_iter; + + //if (iter == 0) { + // memset(model.memory_cross_k->data, 0, ggml_nbytes(model.memory_cross_k)); + // memset(model.memory_cross_v->data, 0, ggml_nbytes(model.memory_cross_v)); + //} + + static int iter = 0; + + const size_t e_pe_stride = model.e_pe->ne[0]*ggml_element_size(model.e_pe); + const size_t e_pe_offset = model.e_pe->ne[0]*ggml_element_size(model.e_pe)*n_ctx*iter; + + struct ggml_tensor * e_pe = ggml_view_2d(ctx0, model.e_pe, model.e_pe->ne[0], n_ctx, e_pe_stride, e_pe_offset); + + cur = ggml_add(ctx0, e_pe, ggml_transpose(ctx0, cur)); + // =================================================================== + + // original: + //cur = ggml_add(ctx0, model.e_pe, ggml_transpose(ctx0, cur)); + + struct ggml_tensor * inpL = cur; + + for (int il = 0; il < n_layer; ++il) { + const auto & layer = model.layers_encoder[il]; + + // create separate context for each layer to reduce memory usage + + struct ggml_init_params paramsL; + paramsL.mem_size = wctx.buf_compute_layer.size(); + paramsL.mem_buffer = wctx.buf_compute_layer.data(); + + struct ggml_context * ctxL = ggml_init(paramsL); + + // norm + { + cur = ggml_norm(ctxL, inpL); + + // cur = ln_0_w*cur + ln_0_b + cur = ggml_add(ctxL, + ggml_mul(ctxL, + ggml_repeat(ctxL, layer.attn_ln_0_w, cur), + cur), + ggml_repeat(ctxL, layer.attn_ln_0_b, cur)); + } + + // self-attention + { + struct ggml_tensor * Qcur = ggml_mul_mat(ctxL, + layer.attn_q_w, + cur); + + Qcur = ggml_add(ctxL, + ggml_repeat(ctxL, + layer.attn_q_b, + Qcur), + Qcur); + + //Qcur = ggml_scale(ctxL, Qcur, ggml_new_f32(ctxL, pow(float(n_state)/n_head, -0.25))); + + // note: no bias for Key + struct ggml_tensor * Kcur = ggml_mul_mat(ctxL, + layer.attn_k_w, + cur); + + //Kcur = ggml_scale(ctxL, Kcur, ggml_new_f32(ctxL, pow(float(n_state)/n_head, -0.25))); + + struct ggml_tensor * Vcur = ggml_mul_mat(ctxL, + layer.attn_v_w, + cur); + + Vcur = ggml_add(ctxL, + ggml_repeat(ctxL, + layer.attn_v_b, + Vcur), + Vcur); + + // ------ + +#ifdef USE_FLASH_ATTN + 
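// The self-attention block around this point builds softmax(K*Q/sqrt(d))*V per
// head out of ggml permutes and matmuls. For reference, the same computation
// for a single head written out in scalar form - a simplified sketch of the
// math, not how ggml actually executes it:

#include <algorithm>
#include <cmath>
#include <vector>

// single-head attention on row-major [n_ctx, d] matrices: out = softmax(Q K^T / sqrt(d)) V
static std::vector<float> attention_single_head(const std::vector<float> & Q,
                                                const std::vector<float> & K,
                                                const std::vector<float> & V,
                                                int n_ctx, int d) {
    std::vector<float> out(n_ctx*d, 0.0f);
    std::vector<float> row(n_ctx);

    for (int i = 0; i < n_ctx; ++i) {
        // scores of query i against all keys, scaled by 1/sqrt(head dim)
        float maxv = -1e30f;
        for (int j = 0; j < n_ctx; ++j) {
            float s = 0.0f;
            for (int k = 0; k < d; ++k) {
                s += Q[i*d + k]*K[j*d + k];
            }
            row[j] = s/std::sqrt((float) d);
            maxv = std::max(maxv, row[j]);
        }

        // softmax over the scores
        float sum = 0.0f;
        for (int j = 0; j < n_ctx; ++j) {
            row[j] = std::exp(row[j] - maxv);
            sum += row[j];
        }

        // weighted sum of the value vectors
        for (int j = 0; j < n_ctx; ++j) {
            const float w = row[j]/sum;
            for (int k = 0; k < d; ++k) {
                out[i*d + k] += w*V[j*d + k];
            }
        }
    }

    return out;
}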
struct ggml_tensor * Q = + ggml_permute(ctxL, + ggml_cpy(ctxL, + Qcur, + ggml_new_tensor_3d(ctxL, GGML_TYPE_F16, n_state/n_head, n_head, n_ctx)), + 0, 2, 1, 3); + + struct ggml_tensor * K = + ggml_permute(ctxL, + ggml_cpy(ctxL, + Kcur, + ggml_new_tensor_3d(ctxL, GGML_TYPE_F16, n_state/n_head, n_head, n_ctx)), + 0, 2, 1, 3); + + struct ggml_tensor * V = + ggml_cpy(ctxL, + ggml_permute(ctxL, + ggml_reshape_3d(ctxL, + Vcur, + n_state/n_head, n_head, n_ctx), + 1, 2, 0, 3), + ggml_new_tensor_3d(ctxL, GGML_TYPE_F16, n_ctx, n_state/n_head, n_head) + ); + + struct ggml_tensor * KQV = ggml_flash_attn(ctxL, Q, K, V, false); +#else + struct ggml_tensor * Q = + ggml_permute(ctxL, + ggml_cpy(ctxL, + Qcur, + ggml_new_tensor_3d(ctxL, GGML_TYPE_F32, n_state/n_head, n_head, n_ctx)), + 0, 2, 1, 3); + + struct ggml_tensor * K = + ggml_permute(ctxL, + ggml_cpy(ctxL, + Kcur, + ggml_new_tensor_3d(ctxL, GGML_TYPE_F16, n_state/n_head, n_head, n_ctx)), + 0, 2, 1, 3); + + // K * Q + struct ggml_tensor * KQ = ggml_mul_mat(ctxL, K, Q); + + struct ggml_tensor * KQ_scaled = + ggml_scale(ctxL, + KQ, + ggml_new_f32(ctxL, 1.0f/sqrt(float(n_state)/n_head)) + ); + + struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctxL, KQ_scaled); + + //struct ggml_tensor * V_trans = + // ggml_permute(ctxL, + // ggml_cpy(ctxL, + // Vcur, + // ggml_new_tensor_3d(ctxL, GGML_TYPE_F16, n_state/n_head, n_head, n_ctx)), + // 1, 2, 0, 3); + + //struct ggml_tensor * KQV = ggml_mul_mat(ctxL, V_trans, KQ_soft_max); + + struct ggml_tensor * V = + ggml_cpy(ctxL, + ggml_permute(ctxL, + ggml_reshape_3d(ctxL, + Vcur, + n_state/n_head, n_head, n_ctx), + 0, 2, 1, 3), + ggml_new_tensor_3d(ctxL, GGML_TYPE_F16, n_state/n_head, n_ctx, n_head) + ); + + struct ggml_tensor * KQV = ggml_mul_mat(ctxL, ggml_transpose(ctxL, V), KQ_soft_max); +#endif + + struct ggml_tensor * KQV_merged = ggml_permute(ctxL, KQV, 0, 2, 1, 3); + + cur = ggml_cpy(ctxL, + KQV_merged, + ggml_new_tensor_2d(ctxL, GGML_TYPE_F32, n_state, n_ctx)); + } + + // projection + { + cur = ggml_mul_mat(ctxL, + layer.attn_ln_1_w, + cur); + + cur = ggml_add(ctxL, + ggml_repeat(ctxL, layer.attn_ln_1_b, cur), + cur); + } + + // add the input + cur = ggml_add(ctxL, cur, inpL); + + struct ggml_tensor * inpFF = cur; + + // feed-forward network + { + // norm + { + cur = ggml_norm(ctxL, inpFF); + + // cur = mlp_ln_w*cur + mlp_ln_b + cur = ggml_add(ctxL, + ggml_mul(ctxL, + ggml_repeat(ctxL, layer.mlp_ln_w, cur), + cur), + ggml_repeat(ctxL, layer.mlp_ln_b, cur)); + } + +#ifdef USE_FLASH_FF + cur = ggml_flash_ff(ctxL, + ggml_cpy(ctxL, cur, ggml_new_tensor_2d(ctxL, GGML_TYPE_F16, n_state, N)), + layer.mlp_0_w, layer.mlp_0_b, layer.mlp_1_w, layer.mlp_1_b); +#else + // fully connected + cur = ggml_mul_mat(ctxL, + layer.mlp_0_w, + cur); + + cur = ggml_add(ctxL, + ggml_repeat(ctxL, layer.mlp_0_b, cur), + cur); + + // GELU activation + cur = ggml_gelu(ctxL, cur); + + // projection + cur = ggml_mul_mat(ctxL, + layer.mlp_1_w, + cur); + + cur = ggml_add(ctxL, + ggml_repeat(ctxL, layer.mlp_1_b, cur), + cur); +#endif + } + + // output from this layer + struct ggml_tensor * inpO = ggml_add(ctxL, cur, inpFF); + + { + struct ggml_cgraph gf = {}; + gf.n_threads = n_threads; + + ggml_build_forward_expand(&gf, inpO); + ggml_graph_compute (ctxL, &gf); + + //ggml_graph_print(&gf); + } + + // TODO: this is a hack to have per-layer computation graphs - need to come up with something better + // input for next layer (inpO -> inpL) + memcpy(inpL->data, inpO->data, ggml_nbytes(inpL)); + inpL->op = GGML_OP_NONE; + inpL->src0 = NULL; + 
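// The feed-forward block above is: x -> W0*x + b0 -> GELU -> W1*(...) + b1.
// A scalar sketch for a single activation vector, using the widely used tanh
// approximation of GELU (an assumption chosen as a close stand-in for
// ggml_gelu, not a claim about its exact formula):

#include <cmath>
#include <vector>

static float gelu(float x) {
    const float c = 0.7978845608f; // sqrt(2/pi)
    return 0.5f*x*(1.0f + std::tanh(c*(x + 0.044715f*x*x*x)));
}

// y = W1 * gelu(W0*x + b0) + b1, with row-major W0 [n_ff, n_state] and W1 [n_state, n_ff]
static std::vector<float> mlp(const std::vector<float> & x,
                              const std::vector<float> & W0, const std::vector<float> & b0,
                              const std::vector<float> & W1, const std::vector<float> & b1,
                              int n_state, int n_ff) {
    std::vector<float> h(n_ff);
    for (int i = 0; i < n_ff; ++i) {
        float s = b0[i];
        for (int j = 0; j < n_state; ++j) {
            s += W0[i*n_state + j]*x[j];
        }
        h[i] = gelu(s);
    }

    std::vector<float> y(n_state);
    for (int i = 0; i < n_state; ++i) {
        float s = b1[i];
        for (int j = 0; j < n_ff; ++j) {
            s += W1[i*n_ff + j]*h[j];
        }
        y[i] = s;
    }

    return y;
}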
inpL->src1 = NULL; + + //printf("%s: - used_mem(%d) = %f MB\n", __func__, il, ggml_used_mem(ctxL)/1024.0/1024.0); + + ggml_free(ctxL); + } + + cur = inpL; + + // norm + { + cur = ggml_norm(ctx0, cur); + + // cur = ln_f_g*cur + ln_f_b + cur = ggml_add(ctx0, + ggml_mul(ctx0, + ggml_repeat(ctx0, model.e_ln_w, cur), + cur), + ggml_repeat(ctx0, model.e_ln_b, cur)); + } + + // run the computation + { + struct ggml_cgraph gf = {}; + gf.n_threads = n_threads; + + ggml_build_forward_expand(&gf, cur); + ggml_graph_compute (ctx0, &gf); + + //ggml_graph_print(&gf); + } + + // cur + //{ + // printf("ne0 = %d\n", cur->ne[0]); + // printf("ne1 = %d\n", cur->ne[1]); + // for (int i = 0; i < 10; ++i) { + // printf("%8.4f ", ((float *)(cur->data))[i]); + // } + // printf("... "); + // for (int i = cur->ne[0] - 10; i < cur->ne[0]; ++i) { + // printf("%8.4f ", ((float *)(cur->data))[i]); + // } + // printf("\n"); + //} + + // pre-compute cross-attention memory + { + struct ggml_cgraph gf = {}; + gf.n_threads = n_threads; + + // TODO: hack to disconnect the encoded features from the previous graph + cur->op = GGML_OP_NONE; + cur->src0 = NULL; + cur->src1 = NULL; + + for (int il = 0; il < model.hparams.n_text_layer; ++il) { + auto & layer = model.layers_decoder[il]; + + struct ggml_tensor * Kcross = ggml_mul_mat(ctx0, + layer.cross_attn_k_w, + cur); + + Kcross = ggml_scale(ctx0, Kcross, ggml_new_f32(ctx0, pow(float(n_state)/n_head, -0.25))); + + struct ggml_tensor * Vcross = ggml_mul_mat(ctx0, + layer.cross_attn_v_w, + cur); + + Vcross = ggml_add(ctx0, + ggml_repeat(ctx0, + layer.cross_attn_v_b, + Vcross), + Vcross); + + //struct ggml_tensor * k = ggml_view_1d(ctx0, model.memory_cross_k, n_state*n_ctx, (ggml_element_size(model.memory_cross_k)*n_state)*(il*hparams.n_audio_ctx + iter*n_ctx)); + //struct ggml_tensor * v = ggml_view_1d(ctx0, model.memory_cross_v, n_state*n_ctx, (ggml_element_size(model.memory_cross_v)*n_state)*(il*hparams.n_audio_ctx + iter*n_ctx)); + struct ggml_tensor * k = ggml_view_1d(ctx0, model.memory_cross_k, n_state*n_ctx, (ggml_element_size(model.memory_cross_k)*n_state)*(il*n_ctx)); + struct ggml_tensor * v = ggml_view_1d(ctx0, model.memory_cross_v, n_state*n_ctx, (ggml_element_size(model.memory_cross_v)*n_state)*(il*n_ctx)); + + ggml_build_forward_expand(&gf, ggml_cpy(ctx0, Kcross, k)); + ggml_build_forward_expand(&gf, ggml_cpy(ctx0, Vcross, v)); + } + + ggml_graph_compute(ctx0, &gf); + } + + //////////////////////////////////////////////////////////////////////////// + + //printf("%s: used_mem = %f MB\n", __func__, ggml_used_mem(ctx0)/1024.0/1024.0); + + ggml_free(ctx0); + + return true; +} + +// evaluate the decoder +// +// given text prompt + audio features -> predicts the probabilities for the next token +// +// - model: the model +// - n_threads: number of threads to use +// - tokens: text prompt +// - n_tokens: number of tokens in the prompt +// - n_past: number of past tokens to prefix the prompt with +// +static bool whisper_decode( + whisper_context & wctx, + const int n_threads, + const whisper_token * tokens, + const int n_tokens, + const int n_past) { + const auto & model = wctx.model; + const auto & hparams = model.hparams; + + auto & logits_out = wctx.logits; + auto & probs_out = wctx.probs; + + const int n_vocab = hparams.n_vocab; + + const int n_ctx = hparams.n_text_ctx; + const int n_state = hparams.n_text_state; + const int n_head = hparams.n_text_head; + const int n_layer = hparams.n_text_layer; + + const int N = n_tokens; + const int M = wctx.exp_n_audio_ctx > 0 ? 
wctx.exp_n_audio_ctx : hparams.n_audio_ctx; + + struct ggml_init_params params; + params.mem_size = wctx.buf_compute.size(); + params.mem_buffer = wctx.buf_compute.data(); + + struct ggml_context * ctx0 = ggml_init(params); + + struct ggml_tensor * embd = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N); + memcpy(embd->data, tokens, N*ggml_element_size(embd)); + + struct ggml_tensor * position = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N); + for (int i = 0; i < N; ++i) { + ((int32_t *) position->data)[i] = n_past + i; + } + + // token encoding + position encoding + struct ggml_tensor * cur = + ggml_add(ctx0, + ggml_get_rows(ctx0, model.d_te, embd), + ggml_get_rows(ctx0, model.d_pe, position)); + + struct ggml_tensor * inpL = cur; + + for (int il = 0; il < n_layer; ++il) { + const auto & layer = model.layers_decoder[il]; + + struct ggml_init_params paramsL; + paramsL.mem_size = wctx.buf_compute_layer.size(); + paramsL.mem_buffer = wctx.buf_compute_layer.data(); + + struct ggml_context * ctxL = ggml_init(paramsL); + struct ggml_cgraph gf = {}; + gf.n_threads = n_threads; + + // norm + { + cur = ggml_norm(ctxL, inpL); + + // cur = ln_0_w*cur + ln_0_b + cur = ggml_add(ctxL, + ggml_mul(ctxL, + ggml_repeat(ctxL, layer.attn_ln_0_w, cur), + cur), + ggml_repeat(ctxL, layer.attn_ln_0_b, cur)); + } + + // self-attention + { + struct ggml_tensor * Qcur = ggml_mul_mat(ctxL, + layer.attn_q_w, + cur); + + Qcur = ggml_add(ctxL, + ggml_repeat(ctxL, + layer.attn_q_b, + Qcur), + Qcur); + + Qcur = ggml_scale(ctxL, Qcur, ggml_new_f32(ctxL, pow(float(n_state)/n_head, -0.25))); + + // note: no bias for Key + struct ggml_tensor * Kcur = ggml_mul_mat(ctxL, + layer.attn_k_w, + cur); + + Kcur = ggml_scale(ctxL, Kcur, ggml_new_f32(ctxL, pow(float(n_state)/n_head, -0.25))); + + struct ggml_tensor * Vcur = ggml_mul_mat(ctxL, + layer.attn_v_w, + cur); + + Vcur = ggml_add(ctxL, + ggml_repeat(ctxL, + layer.attn_v_b, + Vcur), + Vcur); + + // store key and value to memory + { + struct ggml_tensor * k = ggml_view_1d(ctxL, model.memory_k, N*n_state, (ggml_element_size(model.memory_k)*n_state)*(il*n_ctx + n_past)); + struct ggml_tensor * v = ggml_view_1d(ctxL, model.memory_v, N*n_state, (ggml_element_size(model.memory_v)*n_state)*(il*n_ctx + n_past)); + + ggml_build_forward_expand(&gf, ggml_cpy(ctxL, Kcur, k)); + ggml_build_forward_expand(&gf, ggml_cpy(ctxL, Vcur, v)); + } + + // ------ + + struct ggml_tensor * Q = + ggml_permute(ctxL, + ggml_cpy(ctxL, + Qcur, + ggml_new_tensor_3d(ctxL, GGML_TYPE_F32, n_state/n_head, n_head, N)), + 0, 2, 1, 3); + + struct ggml_tensor * K = + ggml_permute(ctxL, + ggml_reshape_3d(ctxL, + ggml_view_1d(ctxL, model.memory_k, (n_past + N)*n_state, il*n_ctx*ggml_element_size(model.memory_k)*n_state), + n_state/n_head, n_head, n_past + N), + 0, 2, 1, 3); + + // K * Q + struct ggml_tensor * KQ = ggml_mul_mat(ctxL, K, Q); + + //struct ggml_tensor * KQ_scaled = + // ggml_scale(ctxL, + // KQ, + // ggml_new_f32(ctxL, 1.0f/sqrt(float(n_state)/n_head)) + // ); + + struct ggml_tensor * KQ_masked = ggml_diag_mask_inf(ctxL, KQ, n_past); + + struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctxL, KQ_masked); + + struct ggml_tensor * V_trans = + ggml_permute(ctxL, + ggml_reshape_3d(ctxL, + ggml_view_1d(ctxL, model.memory_v, (n_past + N)*n_state, il*n_ctx*ggml_element_size(model.memory_v)*n_state), + n_state/n_head, n_head, n_past + N), + 1, 2, 0, 3); + + struct ggml_tensor * KQV = ggml_mul_mat(ctxL, V_trans, KQ_soft_max); + + struct ggml_tensor * KQV_merged = ggml_permute(ctxL, KQV, 0, 2, 1, 3); + + cur = ggml_cpy(ctxL, 
+ KQV_merged, + ggml_new_tensor_2d(ctxL, GGML_TYPE_F32, n_state, N)); + } + + { + cur = ggml_mul_mat(ctxL, + layer.attn_ln_1_w, + cur); + + cur = ggml_add(ctxL, + ggml_repeat(ctxL, layer.attn_ln_1_b, cur), + cur); + } + + // add the input + struct ggml_tensor * inpCA = ggml_add(ctxL, cur, inpL); + + // norm + { + cur = ggml_norm(ctxL, inpCA); // note: we use inpCA here + + // cur = ln_0_w*cur + ln_0_b + cur = ggml_add(ctxL, + ggml_mul(ctxL, + ggml_repeat(ctxL, layer.cross_attn_ln_0_w, cur), + cur), + ggml_repeat(ctxL, layer.cross_attn_ln_0_b, cur)); + } + + // cross-attention + { + struct ggml_tensor * Qcur = ggml_mul_mat(ctxL, + layer.cross_attn_q_w, + cur); + + Qcur = ggml_add(ctxL, + ggml_repeat(ctxL, + layer.cross_attn_q_b, + Qcur), + Qcur); + + Qcur = ggml_scale(ctxL, Qcur, ggml_new_f32(ctxL, pow(float(n_state)/n_head, -0.25))); + + // Kcross is already scaled + struct ggml_tensor * Kcross = + ggml_reshape_3d(ctxL, + ggml_view_1d(ctxL, model.memory_cross_k, M*n_state, il*M*ggml_element_size(model.memory_cross_k)*n_state), + n_state/n_head, n_head, M); + + struct ggml_tensor * Vcross = + ggml_reshape_3d(ctxL, + ggml_view_1d(ctxL, model.memory_cross_v, M*n_state, il*M*ggml_element_size(model.memory_cross_v)*n_state), + n_state/n_head, n_head, M); + + // ------ + + struct ggml_tensor * Q = + ggml_permute(ctxL, + ggml_cpy(ctxL, + Qcur, + ggml_new_tensor_3d(ctxL, GGML_TYPE_F32, n_state/n_head, n_head, N)), + 0, 2, 1, 3); + + struct ggml_tensor * K = ggml_permute(ctxL, Kcross, 0, 2, 1, 3); + + // K * Q + struct ggml_tensor * KQ = ggml_mul_mat(ctxL, K, Q); + + //struct ggml_tensor * KQ_scaled = + // ggml_scale(ctxL, + // KQ, + // ggml_new_f32(ctxL, 1.0f/sqrt(float(n_state)/n_head)) + // ); + + // no masking for cross-attention + //struct ggml_tensor * KQ_masked = ggml_diag_mask_inf(ctxL, KQ_scaled, n_past); + + struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctxL, KQ); + + struct ggml_tensor * V_trans = ggml_permute(ctxL, Vcross, 1, 2, 0, 3); + + struct ggml_tensor * KQV = ggml_mul_mat(ctxL, V_trans, KQ_soft_max); + + struct ggml_tensor * KQV_merged = ggml_permute(ctxL, KQV, 0, 2, 1, 3); + + // cur = KQV_merged.contiguous().view(n_state, N) + cur = ggml_cpy(ctxL, + KQV_merged, + ggml_new_tensor_2d(ctxL, GGML_TYPE_F32, n_state, N)); + } + + // projection + { + cur = ggml_mul_mat(ctxL, + layer.cross_attn_ln_1_w, + cur); + + cur = ggml_add(ctxL, + ggml_repeat(ctxL, layer.cross_attn_ln_1_b, cur), + cur); + } + + // add the input + cur = ggml_add(ctxL, cur, inpCA); + + struct ggml_tensor * inpFF = cur; + + // feed-forward network + { + // norm + { + cur = ggml_norm(ctxL, inpFF); + + // cur = mlp_ln_w*cur + mlp_ln_b + cur = ggml_add(ctxL, + ggml_mul(ctxL, + ggml_repeat(ctxL, layer.mlp_ln_w, cur), + cur), + ggml_repeat(ctxL, layer.mlp_ln_b, cur)); + } + + // fully connected + cur = ggml_mul_mat(ctxL, + layer.mlp_0_w, + cur); + + cur = ggml_add(ctxL, + ggml_repeat(ctxL, layer.mlp_0_b, cur), + cur); + + // GELU activation + cur = ggml_gelu(ctxL, cur); + + // projection + cur = ggml_mul_mat(ctxL, + layer.mlp_1_w, + cur); + + cur = ggml_add(ctxL, + ggml_repeat(ctxL, layer.mlp_1_b, cur), + cur); + } + + // output from this layer + struct ggml_tensor * inpO = ggml_add(ctxL, cur, inpFF); + + { + ggml_build_forward_expand(&gf, inpO); + ggml_graph_compute (ctxL, &gf); + + //ggml_graph_print(&gf); + } + + // TODO: this is a hack to have per-layer computation graphs - need to come up with something better + // input for next layer (inpO -> inpL) + memcpy(inpL->data, inpO->data, ggml_nbytes(inpL)); + inpL->op 
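// The KV cache views above are plain 1-D offsets into memory_k/memory_v and
// memory_cross_k/memory_cross_v. A small sketch of that indexing, expressed in
// elements (multiply by the F16 element size to get the byte offsets passed to
// ggml_view_1d); the numbers in main() are illustrative only:

#include <cstdint>
#include <cstdio>

// self-attention cache: layer il owns a block of n_ctx slots; the current
// N tokens are written starting at position n_past inside that block
static int64_t self_kv_offset(int il, int n_ctx, int n_past, int n_state) {
    return (int64_t) n_state*((int64_t) il*n_ctx + n_past);
}

// cross-attention cache: layer il owns a block of M (audio context) slots,
// filled once per window by whisper_encode()
static int64_t cross_kv_offset(int il, int M, int n_state) {
    return (int64_t) n_state*(int64_t) il*M;
}

int main() {
    const int n_state = 512, n_ctx = 448, M = 1500; // base-model-sized contexts, assumed

    printf("layer 3, n_past 17 -> self-attn offset  %lld elements\n",
           (long long) self_kv_offset(3, n_ctx, 17, n_state));
    printf("layer 3            -> cross-attn offset %lld elements\n",
           (long long) cross_kv_offset(3, M, n_state));
    return 0;
}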
= GGML_OP_NONE; + inpL->src0 = NULL; + inpL->src1 = NULL; + + if (N > 1) { + //printf("%s: - used_mem(%d) = %f MB\n", __func__, il, ggml_used_mem(ctxL)/1024.0/1024.0); + } + + ggml_free(ctxL); + } + + cur = inpL; + + // norm + { + cur = ggml_norm(ctx0, cur); + + cur = ggml_add(ctx0, + ggml_mul(ctx0, + ggml_repeat(ctx0, model.d_ln_w, cur), + cur), + ggml_repeat(ctx0, model.d_ln_b, cur)); + } + + struct ggml_tensor * logits = ggml_mul_mat(ctx0, model.d_te, cur); + + // logits -> probs + cur = ggml_dup(ctx0, logits); + cur = ggml_soft_max(ctx0, cur); // in-place + + // run the computation + { + struct ggml_cgraph gf = {}; + gf.n_threads = n_threads; + + ggml_build_forward_expand(&gf, cur); + ggml_graph_compute (ctx0, &gf); + } + + logits_out.resize(N*n_vocab); + memcpy(logits_out.data(), ggml_get_data(logits), sizeof(float)*N*n_vocab); + + probs_out.resize(N*n_vocab); + memcpy(probs_out.data(), ggml_get_data(cur), sizeof(float)*N*n_vocab); + + if (N > 1) { + //const float mem_per_token = ggml_used_mem(ctx0)/1024.0/1024.0/N; + //printf("%s: used_mem = %f MB / %f per token\n", __func__, ggml_used_mem(ctx0)/1024.0/1024.0, mem_per_token); + //printf("%s: max mem = %f MB\n", __func__, mem_per_token*model.hparams.n_text_ctx); + } + + ggml_free(ctx0); + + return true; +} + +// the most basic sampling scheme - select the top token +static whisper_token_data whisper_sample_best( + const whisper_vocab & vocab, + const float * probs, + bool force_timestamp, + bool is_initial) { + whisper_token_data result = { + 0, 0, 0.0f, 0.0f, 0.0f, -1, -1, 0.0f, + }; + + int n_logits = vocab.id_to_token.size(); + + std::vector> probs_id; + probs_id.reserve(n_logits); + + for (int i = 0; i < n_logits; i++) { + probs_id.push_back(std::make_pair(probs[i], i)); + } + + { + double sum_ts = 0.0; + double max_ts = -1.0; + double max_tx = -1.0; + + for (int i = 0; i < vocab.token_beg; i++) { + max_tx = std::max(max_tx, probs_id[i].first); + } + + const auto i0 = is_initial ? vocab.token_beg + 101 : vocab.token_beg; + const auto i1 = is_initial ? 
vocab.token_beg + 101 : n_logits; + + // the initial timestamp cannot be larger than 100 + // ref: https://github.com/openai/whisper/blob/0b1ba3d46ebf7fe6f953acfd8cad62a4f851b49f/whisper/decoding.py#L426-L429 + if (is_initial) { + for (int i = i0; i < n_logits; ++ i) { + probs_id[i].first = -INFINITY; + } + } + + for (int i = vocab.token_beg; i < i1; i++) { + sum_ts += probs_id[i].first; + if (probs_id[i].first > max_ts) { + max_ts = probs_id[i].first; + result.tid = probs_id[i].second; + } + } + + // if the probability sum of all timestamp tokens is higher than the max probability of the text tokens - sample a + // timestamp token + if (sum_ts > max_tx || force_timestamp) { + // ref: https://github.com/openai/whisper/blob/0b1ba3d46ebf7fe6f953acfd8cad62a4f851b49f/whisper/decoding.py#L430-L438 + for (int i = 0; i < vocab.token_beg; i++) { + probs_id[i].first = -INFINITY; + } + } + + result.pt = max_ts/(sum_ts + 1e-10); + result.ptsum = sum_ts; + } + + // find the top K tokens + const int top_k = 4; + + std::partial_sort( + probs_id.begin(), + probs_id.begin() + top_k, probs_id.end(), + [](const std::pair & a, const std::pair & b) { + return a.first > b.first; + }); + + probs_id.resize(top_k); + + //printf("\n"); + //for (int i = 0; i < (int) probs_id.size(); i++) { + // printf("%d: '%s' %f, %d\n", i, vocab.id_to_token.at(probs_id[i].second).c_str(), probs_id[i].first, probs_id[i].second); + //} + + int res = 0; + while ((probs_id[res].second == vocab.token_sot || + probs_id[res].second == vocab.token_solm || + probs_id[res].second == vocab.token_not) && + res < (int) probs_id.size() - 1) { + res++; + } + + result.id = probs_id[res].second; + result.p = probs_id[res].first; + + return result; +} + +// 500 -> 00:05.000 +// 6000 -> 01:00.000 +static std::string to_timestamp(int64_t t, bool comma = false) { + int64_t msec = t * 10; + int64_t hr = msec / (1000 * 60 * 60); + msec = msec - hr * (1000 * 60 * 60); + int64_t min = msec / (1000 * 60); + msec = msec - min * (1000 * 60); + int64_t sec = msec / 1000; + msec = msec - sec * 1000; + + char buf[32]; + snprintf(buf, sizeof(buf), "%02d:%02d:%02d%s%03d", (int) hr, (int) min, (int) sec, comma ? 
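// The sampling rule above: if the total probability mass of the timestamp
// tokens exceeds the probability of the best text token (or a timestamp is
// forced), the text tokens are masked out so that a timestamp token must be
// selected. A condensed sketch of just that decision:

#include <vector>

// probs: probabilities over the vocabulary; ids >= token_beg are timestamp tokens
static bool should_sample_timestamp(const std::vector<float> & probs,
                                    int token_beg, bool force_timestamp) {
    float max_tx = 0.0f; // highest text-token probability
    for (int i = 0; i < token_beg; ++i) {
        if (probs[i] > max_tx) max_tx = probs[i];
    }

    double sum_ts = 0.0; // total timestamp-token probability
    for (int i = token_beg; i < (int) probs.size(); ++i) {
        sum_ts += probs[i];
    }

    return force_timestamp || sum_ts > max_tx;
}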
"," : ".", (int) msec); + + return std::string(buf); +} + +// naive Discrete Fourier Transform +// input is real-valued +// output is complex-valued +static void dft(const std::vector & in, std::vector & out) { + int N = in.size(); + + out.resize(N*2); + + for (int k = 0; k < N; k++) { + float re = 0; + float im = 0; + + for (int n = 0; n < N; n++) { + float angle = 2*M_PI*k*n/N; + re += in[n]*cos(angle); + im -= in[n]*sin(angle); + } + + out[k*2 + 0] = re; + out[k*2 + 1] = im; + } +} + +// Cooley-Tukey FFT +// poor man's implementation - use something better +// input is real-valued +// output is complex-valued +static void fft(const std::vector & in, std::vector & out) { + out.resize(in.size()*2); + + int N = in.size(); + + if (N == 1) { + out[0] = in[0]; + out[1] = 0; + return; + } + + if (N%2 == 1) { + dft(in, out); + return; + } + + std::vector even; + std::vector odd; + + for (int i = 0; i < N; i++) { + if (i % 2 == 0) { + even.push_back(in[i]); + } else { + odd.push_back(in[i]); + } + } + + std::vector even_fft; + std::vector odd_fft; + + fft(even, even_fft); + fft(odd, odd_fft); + + for (int k = 0; k < N/2; k++) { + float theta = 2*M_PI*k/N; + + float re = cos(theta); + float im = -sin(theta); + + float re_odd = odd_fft[2*k + 0]; + float im_odd = odd_fft[2*k + 1]; + + out[2*k + 0] = even_fft[2*k + 0] + re*re_odd - im*im_odd; + out[2*k + 1] = even_fft[2*k + 1] + re*im_odd + im*re_odd; + + out[2*(k + N/2) + 0] = even_fft[2*k + 0] - re*re_odd + im*im_odd; + out[2*(k + N/2) + 1] = even_fft[2*k + 1] - re*im_odd - im*re_odd; + } +} + +// ref: https://github.com/openai/whisper/blob/main/whisper/audio.py#L92-L124 +static bool log_mel_spectrogram( + const float * samples, + const int n_samples, + const int sample_rate, + const int fft_size, + const int fft_step, + const int n_mel, + const int n_threads, + const whisper_filters & filters, + const bool speed_up, + whisper_mel & mel) { + + // Hanning window + std::vector hann; + hann.resize(fft_size); + for (int i = 0; i < fft_size; i++) { + hann[i] = 0.5*(1.0 - cos((2.0*M_PI*i)/(fft_size))); + } + + mel.n_mel = n_mel; + mel.n_len = (n_samples)/fft_step; + mel.data.resize(mel.n_mel*mel.n_len); + + const int n_fft = 1 + (speed_up ? 
fft_size/4 : fft_size/2); + + //printf("%s: n_samples = %d, n_len = %d\n", __func__, n_samples, mel.n_len); + //printf("%s: recording length: %f s\n", __func__, (float) n_samples/sample_rate); + + std::vector workers(n_threads); + for (int iw = 0; iw < n_threads; ++iw) { + workers[iw] = std::thread([&](int ith) { + std::vector fft_in; + fft_in.resize(fft_size); + for (int i = 0; i < fft_size; i++) { + fft_in[i] = 0.0; + } + + std::vector fft_out; + fft_out.resize(2*fft_size); + + for (int i = ith; i < mel.n_len; i += n_threads) { + const int offset = i*fft_step; + + // apply Hanning window + for (int j = 0; j < fft_size; j++) { + if (offset + j < n_samples) { + fft_in[j] = hann[j]*samples[offset + j]; + } else { + fft_in[j] = 0.0; + } + } + + // FFT -> mag^2 + fft(fft_in, fft_out); + + for (int j = 0; j < fft_size; j++) { + fft_out[j] = (fft_out[2*j + 0]*fft_out[2*j + 0] + fft_out[2*j + 1]*fft_out[2*j + 1]); + } + for (int j = 1; j < fft_size/2; j++) { + //if (i == 0) { + // printf("%d: %f %f\n", j, fft_out[j], fft_out[fft_size - j]); + //} + fft_out[j] += fft_out[fft_size - j]; + } + if (i == 0) { + //for (int j = 0; j < fft_size; j++) { + // printf("%d: %e\n", j, fft_out[j]); + //} + } + + if (speed_up) { + // scale down in the frequency domain results in a speed up in the time domain + for (int j = 0; j < n_fft; j++) { + fft_out[j] = 0.5*(fft_out[2*j] + fft_out[2*j + 1]); + } + } + + // mel spectrogram + for (int j = 0; j < mel.n_mel; j++) { + double sum = 0.0; + + for (int k = 0; k < n_fft; k++) { + sum += fft_out[k]*filters.data[j*n_fft + k]; + } + if (sum < 1e-10) { + sum = 1e-10; + } + + sum = log10(sum); + + mel.data[j*mel.n_len + i] = sum; + } + } + }, iw); + } + + for (int iw = 0; iw < n_threads; ++iw) { + workers[iw].join(); + } + + // clamping and normalization + double mmax = -1e20; + for (int i = 0; i < mel.n_mel*mel.n_len; i++) { + if (mel.data[i] > mmax) { + mmax = mel.data[i]; + } + } + //printf("%s: max = %f\n", __func__, mmax); + + mmax -= 8.0; + + for (int i = 0; i < mel.n_mel*mel.n_len; i++) { + if (mel.data[i] < mmax) { + mel.data[i] = mmax; + } + + mel.data[i] = (mel.data[i] + 4.0)/4.0; + } + + return true; +} + +// +// interface implementation +// + +struct whisper_context * whisper_init(const char * path_model) { + ggml_time_init(); + + whisper_context * ctx = new whisper_context; + + const int64_t t_start_us = ggml_time_us(); + + ctx->t_start_us = t_start_us; + + if (!whisper_model_load(path_model, *ctx)) { + fprintf(stderr, "%s: failed to load model from '%s'\n", __func__, path_model); + return NULL; + } + + ctx->t_load_us = ggml_time_us() - t_start_us; + + return ctx; +} + +void whisper_free(struct whisper_context * ctx) { + if (ctx) { + if (ctx->model.ctx) { + ggml_free(ctx->model.ctx); + } + if (ctx->model.ctx_mem) { + ggml_free(ctx->model.ctx_mem); + } + if (ctx->buf_model) { + delete ctx->buf_model; + } + delete ctx; + } +} + +int whisper_pcm_to_mel(struct whisper_context * ctx, const float * samples, int n_samples, int n_threads) { + const int64_t t_start_us = ggml_time_us(); + + if (!log_mel_spectrogram(samples, n_samples, WHISPER_SAMPLE_RATE, WHISPER_N_FFT, WHISPER_HOP_LENGTH, WHISPER_N_MEL, n_threads, ctx->model.filters, false, ctx->mel)) { + fprintf(stderr, "%s: failed to compute mel spectrogram\n", __func__); + return -1; + } + + ctx->t_mel_us = ggml_time_us() - t_start_us; + + return 0; +} + +// same as whisper_pcm_to_mel, but applies a Phase Vocoder to speed up the audio x2 +int whisper_pcm_to_mel_phase_vocoder(struct whisper_context * ctx, const 
float * samples, int n_samples, int n_threads) { + const int64_t t_start_us = ggml_time_us(); + + if (!log_mel_spectrogram(samples, n_samples, WHISPER_SAMPLE_RATE, 2*WHISPER_N_FFT, 2*WHISPER_HOP_LENGTH, WHISPER_N_MEL, n_threads, ctx->model.filters, true, ctx->mel)) { + fprintf(stderr, "%s: failed to compute mel spectrogram\n", __func__); + return -1; + } + + ctx->t_mel_us = ggml_time_us() - t_start_us; + + return 0; +} + +int whisper_set_mel( + struct whisper_context * ctx, + const float * data, + int n_len, + int n_mel) { + if (n_mel != WHISPER_N_MEL) { + fprintf(stderr, "%s: invalid number of mel bands: %d (expected %d)\n", __func__, n_mel, WHISPER_N_MEL); + return -1; + } + + ctx->mel.n_len = n_len; + ctx->mel.n_mel = n_mel; + + ctx->mel.data.resize(n_len*n_mel); + memcpy(ctx->mel.data.data(), data, n_len*n_mel*sizeof(float)); + + return 0; +} + +int whisper_encode(struct whisper_context * ctx, int offset, int n_threads) { + const int64_t t_start_us = ggml_time_us(); + + if (!whisper_encode(*ctx, n_threads, offset)) { + fprintf(stderr, "%s: failed to eval\n", __func__); + return -1; + } + + ctx->t_encode_us += ggml_time_us() - t_start_us; + + return 0; +} + +int whisper_decode(struct whisper_context * ctx, const whisper_token * tokens, int n_tokens, int n_past, int n_threads) { + const int64_t t_start_us = ggml_time_us(); + + if (!whisper_decode(*ctx, n_threads, tokens, n_tokens, n_past)) { + fprintf(stderr, "%s: failed to eval\n", __func__); + return 1; + } + + ctx->t_decode_us += ggml_time_us() - t_start_us; + + return 0; +} + +struct whisper_token_data whisper_sample_best(struct whisper_context * ctx) { + const int64_t t_start_sample_us = ggml_time_us(); + + const auto res = whisper_sample_best(ctx->vocab, ctx->probs.data() + (ctx->probs.size() - ctx->vocab.n_vocab), false, false); + + ctx->t_sample_us += ggml_time_us() - t_start_sample_us; + + return res; +} + +struct whisper_token_data whisper_sample_timestamp(struct whisper_context * ctx, bool is_initial) { + const int64_t t_start_sample_us = ggml_time_us(); + + const auto res = whisper_sample_best(ctx->vocab, ctx->probs.data() + (ctx->probs.size() - ctx->vocab.n_vocab), true, is_initial); + + ctx->t_sample_us += ggml_time_us() - t_start_sample_us; + + return res; +} + +int whisper_lang_id(const char * lang) { + if (!g_lang.count(lang)) { + fprintf(stderr, "%s: unknown language '%s'\n", __func__, lang); + return -1; + } + + return g_lang.at(lang).first; +} + +int whisper_n_len(struct whisper_context * ctx) { + return ctx->mel.n_len; +} + +int whisper_n_vocab(struct whisper_context * ctx) { + return ctx->vocab.n_vocab; +} + +int whisper_n_text_ctx(struct whisper_context * ctx) { + return ctx->model.hparams.n_text_ctx; +} + +int whisper_is_multilingual(struct whisper_context * ctx) { + return ctx->vocab.is_multilingual() ? 
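// Timing arithmetic behind the spectrogram: with WHISPER_SAMPLE_RATE = 16000
// and WHISPER_HOP_LENGTH = 160 there are exactly 100 mel frames per second,
// and whisper_encode() consumes 2*n_ctx mel frames per window, i.e. two mel
// frames per encoder position. A small sketch of those relationships:

#include <cstdio>

int main() {
    const int sample_rate = 16000; // WHISPER_SAMPLE_RATE
    const int hop_length  = 160;   // WHISPER_HOP_LENGTH
    const int chunk_s     = 30;    // WHISPER_CHUNK_SIZE, seconds per window

    const int frames_per_second = sample_rate/hop_length;    // 100
    const int frames_per_chunk  = frames_per_second*chunk_s; // 3000
    const int encoder_positions = frames_per_chunk/2;        // 1500 (2 mel frames per position)

    printf("mel frames per second : %d\n", frames_per_second);
    printf("mel frames per chunk  : %d\n", frames_per_chunk);
    printf("encoder positions     : %d\n", encoder_positions);
    return 0;
}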
1 : 0; +} + +float * whisper_get_probs(struct whisper_context * ctx) { + return ctx->probs.data(); +} + +const char * whisper_token_to_str(struct whisper_context * ctx, whisper_token token) { + return ctx->vocab.id_to_token.at(token).c_str(); +} + +whisper_token whisper_token_eot(struct whisper_context * ctx) { + return ctx->vocab.token_eot; +} + +whisper_token whisper_token_sot(struct whisper_context * ctx) { + return ctx->vocab.token_sot; +} + +whisper_token whisper_token_prev(struct whisper_context * ctx) { + return ctx->vocab.token_prev; +} + +whisper_token whisper_token_solm(struct whisper_context * ctx) { + return ctx->vocab.token_solm; +} + +whisper_token whisper_token_not(struct whisper_context * ctx) { + return ctx->vocab.token_not; +} + +whisper_token whisper_token_beg(struct whisper_context * ctx) { + return ctx->vocab.token_beg; +} + +whisper_token whisper_token_translate(void) { + return whisper_vocab::token_translate; +} + +whisper_token whisper_token_transcribe(void) { + return whisper_vocab::token_transcribe; +} + +void whisper_print_timings(struct whisper_context * ctx) { + const int64_t t_end_us = ggml_time_us(); + + fprintf(stderr, "\n"); + fprintf(stderr, "%s: load time = %8.2f ms\n", __func__, ctx->t_load_us/1000.0f); + fprintf(stderr, "%s: mel time = %8.2f ms\n", __func__, ctx->t_mel_us/1000.0f); + fprintf(stderr, "%s: sample time = %8.2f ms\n", __func__, ctx->t_sample_us/1000.0f); + fprintf(stderr, "%s: encode time = %8.2f ms / %.2f ms per layer\n", __func__, ctx->t_encode_us/1000.0f, ctx->t_encode_us/1000.0f/ctx->model.hparams.n_audio_layer); + fprintf(stderr, "%s: decode time = %8.2f ms / %.2f ms per layer\n", __func__, ctx->t_decode_us/1000.0f, ctx->t_decode_us/1000.0f/ctx->model.hparams.n_text_layer); + fprintf(stderr, "%s: total time = %8.2f ms\n", __func__, (t_end_us - ctx->t_start_us)/1000.0f); +} + +void whisper_reset_timings(struct whisper_context * ctx) { + ctx->t_sample_us = 0; + ctx->t_encode_us = 0; + ctx->t_decode_us = 0; +} + +const char * whisper_print_system_info(void) { + static std::string s; + + s = ""; + s += "AVX = " + std::to_string(ggml_cpu_has_avx()) + " | "; + s += "AVX2 = " + std::to_string(ggml_cpu_has_avx2()) + " | "; + s += "AVX512 = " + std::to_string(ggml_cpu_has_avx512()) + " | "; + s += "NEON = " + std::to_string(ggml_cpu_has_neon()) + " | "; + s += "F16C = " + std::to_string(ggml_cpu_has_f16c()) + " | "; + s += "FP16_VA = " + std::to_string(ggml_cpu_has_fp16_va()) + " | "; + s += "WASM_SIMD = " + std::to_string(ggml_cpu_has_wasm_simd()) + " | "; + s += "BLAS = " + std::to_string(ggml_cpu_has_blas()) + " | "; + + return s.c_str(); +} + +//////////////////////////////////////////////////////////////////////////// + +struct whisper_full_params whisper_full_default_params(enum whisper_sampling_strategy strategy) { + struct whisper_full_params result; + + switch (strategy) { + case WHISPER_SAMPLING_GREEDY: + { + result = { + /*.strategy =*/ WHISPER_SAMPLING_GREEDY, + + /*.n_threads =*/ std::min(4, (int32_t) std::thread::hardware_concurrency()), + /*.n_max_text_ctx =*/ 16384, + /*.offset_ms =*/ 0, + /*.duration_ms =*/ 0, + + /*.translate =*/ false, + /*.no_context =*/ false, + /*.single_segment =*/ false, + /*.print_special =*/ false, + /*.print_progress =*/ true, + /*.print_realtime =*/ false, + /*.print_timestamps =*/ true, + + /*.token_timestamps =*/ false, + /*.thold_pt =*/ 0.01f, + /*.thold_ptsum =*/ 0.01f, + /*.max_len =*/ 0, + /*.max_tokens =*/ 0, + + /*.speed_up =*/ false, + /*.audio_ctx =*/ 0, + + /*.prompt_tokens =*/ nullptr, + 
/*.prompt_n_tokens =*/ 0, + + /*.language =*/ "en", + + /*.greedy =*/ { + /*.n_past =*/ 0, + }, + + /*.beam_search =*/ { + /*.n_past =*/ -1, + /*.beam_width =*/ -1, + /*.n_best =*/ -1, + }, + + /*.new_segment_callback =*/ nullptr, + /*.new_segment_callback_user_data =*/ nullptr, + + /*.encoder_begin_callback =*/ nullptr, + /*.encoder_begin_callback_user_data =*/ nullptr, + }; + } break; + case WHISPER_SAMPLING_BEAM_SEARCH: + { + result = { + /*.strategy =*/ WHISPER_SAMPLING_BEAM_SEARCH, + + /*.n_threads =*/ std::min(4, (int32_t) std::thread::hardware_concurrency()), + /*.n_max_text_ctx =*/ 16384, + /*.offset_ms =*/ 0, + /*.duration_ms =*/ 0, + + /*.translate =*/ false, + /*.no_context =*/ false, + /*.single_segment =*/ false, + /*.print_special =*/ false, + /*.print_progress =*/ true, + /*.print_realtime =*/ false, + /*.print_timestamps =*/ true, + + /*.token_timestamps =*/ false, + /*.thold_pt =*/ 0.01f, + /*.thold_ptsum =*/ 0.01f, + /*.max_len =*/ 0, + /*.max_tokens =*/ 0, + + /*.speed_up =*/ false, + /*.audio_ctx =*/ 0, + + /*.prompt_tokens =*/ nullptr, + /*.prompt_n_tokens =*/ 0, + + /*.language =*/ "en", + + /*.greedy =*/ { + /*.n_past =*/ -1, + }, + + /*.beam_search =*/ { + /*.n_past =*/ 0, + /*.beam_width =*/ 10, + /*.n_best =*/ 5, + }, + + /*.new_segment_callback =*/ nullptr, + /*.new_segment_callback_user_data =*/ nullptr, + + /*.encoder_begin_callback =*/ nullptr, + /*.encoder_begin_callback_user_data =*/ nullptr, + }; + } break; + } + + return result; +} + +// forward declarations +static std::vector get_signal_energy(const float * signal, int n_samples, int n_samples_per_half_window); +static void whisper_exp_compute_token_level_timestamps( + struct whisper_context * ctx, + int i_segment, + float thold_pt, + float thold_ptsum); + +// wrap the last segment to max_len characters +// returns the number of new segments +static int whisper_wrap_segment(struct whisper_context * ctx, int max_len) { + auto segment = ctx->result_all.back(); + + int res = 1; + int acc = 0; + + std::string text; + + for (int i = 0; i < (int) segment.tokens.size(); i++) { + const auto & token = segment.tokens[i]; + if (token.id >= whisper_token_eot(ctx)) { + continue; + } + + const auto txt = whisper_token_to_str(ctx, token.id); + + const int cur = strlen(txt); + + if (acc + cur > max_len && i > 0) { + // split here + ctx->result_all.back().text = std::move(text); + ctx->result_all.back().t1 = token.t0; + ctx->result_all.back().tokens.resize(i); + + ctx->result_all.push_back({}); + ctx->result_all.back().t0 = token.t0; + ctx->result_all.back().t1 = segment.t1; + + // add tokens [i, end] to the new segment + ctx->result_all.back().tokens.insert( + ctx->result_all.back().tokens.end(), + segment.tokens.begin() + i, + segment.tokens.end()); + + acc = 0; + text = ""; + + segment = ctx->result_all.back(); + i = -1; + + res++; + } else { + acc += cur; + text += txt; + } + } + + ctx->result_all.back().text = std::move(text); + + return res; +} + +int whisper_full( + struct whisper_context * ctx, + struct whisper_full_params params, + const float * samples, + int n_samples) { + // clear old results + auto & result_all = ctx->result_all; + + result_all.clear(); + + // compute log mel spectrogram + if (params.speed_up) { + if (whisper_pcm_to_mel_phase_vocoder(ctx, samples, n_samples, params.n_threads) != 0) { + fprintf(stderr, "%s: failed to compute log mel spectrogram\n", __func__); + return -1; + } + } else { + if (whisper_pcm_to_mel(ctx, samples, n_samples, params.n_threads) != 0) { + fprintf(stderr, "%s: failed 
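// How these defaults are meant to be used from application code: grab the
// greedy defaults, override a few fields, and pass everything to
// whisper_full(). The model path is a placeholder and pcmf32 is assumed to
// already hold 16 kHz mono float PCM; decoding audio files is out of scope
// for this sketch.

#include <cstdio>
#include <vector>
#include "whisper.h"

int transcribe(const std::vector<float> & pcmf32) {
    struct whisper_context * wctx = whisper_init("models/ggml-base.en.bin"); // placeholder path
    if (wctx == NULL) {
        return 1;
    }

    struct whisper_full_params wparams = whisper_full_default_params(WHISPER_SAMPLING_GREEDY);
    wparams.n_threads        = 4;
    wparams.language         = "en";
    wparams.translate        = false;
    wparams.print_progress   = false;
    wparams.print_timestamps = true;

    if (whisper_full(wctx, wparams, pcmf32.data(), (int) pcmf32.size()) != 0) {
        fprintf(stderr, "failed to process audio\n");
        whisper_free(wctx);
        return 2;
    }

    const int n_segments = whisper_full_n_segments(wctx);
    for (int i = 0; i < n_segments; ++i) {
        printf("[%lld --> %lld] %s\n",
               (long long) whisper_full_get_segment_t0(wctx, i),
               (long long) whisper_full_get_segment_t1(wctx, i),
               whisper_full_get_segment_text(wctx, i));
    }

    whisper_print_timings(wctx);
    whisper_free(wctx);
    return 0;
}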
to compute log mel spectrogram\n", __func__); + return -1; + } + } + + if (params.token_timestamps) { + ctx->t_beg = 0; + ctx->t_last = 0; + ctx->tid_last = 0; + ctx->energy = get_signal_energy(samples, n_samples, 32); + } + + const int seek_start = params.offset_ms/10; + const int seek_end = seek_start + (params.duration_ms == 0 ? whisper_n_len(ctx) : params.duration_ms/10); + + // if length of spectrogram is less than 1s (100 samples), then return + // basically don't process anything that is less than 1s + // see issue #39: https://github.com/ggerganov/whisper.cpp/issues/39 + if (seek_end < 100 + seek_start) { + return 0; + } + + // the accumulated text context so far + auto & prompt_past = ctx->prompt_past; + if (params.no_context) { + prompt_past.clear(); + } + + // prepend the prompt tokens to the prompt_past + if (params.prompt_tokens && params.prompt_n_tokens > 0) { + // parse tokens from the pointer + for (int i = 0; i < params.prompt_n_tokens; i++) { + prompt_past.push_back(params.prompt_tokens[i]); + } + std::rotate(prompt_past.begin(), prompt_past.end() - params.prompt_n_tokens, prompt_past.end()); + } + + // overwrite audio_ctx + ctx->exp_n_audio_ctx = params.audio_ctx; + + // these tokens determine the task that will be performed + std::vector prompt_init = { whisper_token_sot(ctx) }; + if (whisper_is_multilingual(ctx)) { + prompt_init.push_back(whisper_token_sot(ctx) + 1 + whisper_lang_id(params.language)); + if (params.translate) { + prompt_init.push_back(whisper_token_translate()); + } else { + prompt_init.push_back(whisper_token_transcribe()); + } + } + + int progress_prev = 0; + int progress_step = 5; + + std::vector tokens_cur; + tokens_cur.reserve(whisper_n_text_ctx(ctx)); + + std::vector prompt; + prompt.reserve(whisper_n_text_ctx(ctx)); + + // main loop + int seek = seek_start; + while (true) { + const int progress_cur = (100*(seek - seek_start))/(seek_end - seek_start); + while (progress_cur >= progress_prev + progress_step) { + progress_prev += progress_step; + if (params.print_progress) { + fprintf(stderr, "%s: progress = %3d%%\n", __func__, progress_prev); + } + } + + if (seek + 100 >= seek_end) { + break; + } + + if (params.encoder_begin_callback) { + if (params.encoder_begin_callback(ctx, params.encoder_begin_callback_user_data) == false) { + fprintf(stderr, "%s: encoder_begin_callback returned false - aborting\n", __func__); + break; + } + } + + // encode audio features starting at offset seek + if (whisper_encode(ctx, seek, params.n_threads) != 0) { + fprintf(stderr, "%s: failed to encode\n", __func__); + return 7; + } + + int n_past = 0; + prompt.clear(); + + // if we have already generated some text, use it as a prompt to condition the next generation + if (prompt_past.size() > 0) { + int n_take = std::min(std::min(params.n_max_text_ctx, whisper_n_text_ctx(ctx)/2), int(prompt_past.size())); + + prompt = { whisper_token_prev(ctx) }; + prompt.insert(prompt.begin() + 1, prompt_past.end() - n_take, prompt_past.end()); + + prompt_past.clear(); + prompt_past.insert(prompt_past.end(), prompt.begin() + 1, prompt.end()); + } + + prompt.insert(prompt.end(), prompt_init.begin(), prompt_init.end()); + + int seek_delta = 100*WHISPER_CHUNK_SIZE; + + // print the prompt + //printf("\n\n"); + //for (int i = 0; i < prompt.size(); i++) { + // printf("%s: prompt[%d] = %s\n", __func__, i, ctx->vocab.id_to_token[prompt[i]].c_str()); + //} + //printf("\n\n"); + + // the accumulated transcription in the current interation + int result_len = 0; + tokens_cur.clear(); + + bool 
failed = false; + bool has_ts = false; // have we already sampled a non-beg timestamp token for the current segment? + + for (int i = 0, n_max = whisper_n_text_ctx(ctx)/2 - 4; i < n_max; ++i) { + if (whisper_decode(ctx, prompt.data(), prompt.size(), n_past, params.n_threads) != 0) { + fprintf(stderr, "%s: failed to decode\n", __func__); + return 8; + } + + n_past += prompt.size(); + prompt.clear(); + + // very basic greedy sampling strategy: + // + // - always take the most probable token + // + // more sophisticated sampling strategies could be implemented here, but we keep it simple + // feel free to experiment! + // + { + const auto token = (i == 0) ? whisper_sample_timestamp(ctx, true) : whisper_sample_best(ctx); + + // timestamp token - update sliding window + if (token.id > whisper_token_beg(ctx)) { + const int seek_delta_new = 2*(token.id - whisper_token_beg(ctx)); + + // do not allow to go back in time + if (has_ts && seek_delta > seek_delta_new && result_len < i) { + break; + } + + seek_delta = seek_delta_new; + result_len = i + 1; + has_ts = true; + } + + // add it to the context + prompt.push_back(token.id); + tokens_cur.push_back(token); + + //{ + // const auto tt = token.pt > 0.10 ? ctx->vocab.id_to_token[token.tid] : "[?]"; + // printf("%s: %10s %6d %6.3f '%s'\n", __func__, tt.c_str(), token.id, token.pt, ctx->vocab.id_to_token[token.id].c_str()); + //} + + // end of segment + if (token.id == whisper_token_eot(ctx) || // end of text token + (params.max_tokens > 0 && i > params.max_tokens) || // max tokens per segment reached + (has_ts && seek + seek_delta + 100 >= seek_end) // end of audio reached + ) { + if (result_len == 0) { + if (seek + seek_delta + 100 >= seek_end) { + result_len = i + 1; + } else { + failed = true; + break; + } + } + + if (params.single_segment) { + result_len = i + 1; + seek_delta = 100*WHISPER_CHUNK_SIZE; + } + + break; + } + + // TESTS: if no tensors are loaded, it means we are running tests + if (ctx->model.n_loaded == 0) { + seek_delta = 100*WHISPER_CHUNK_SIZE; + break; + } + } + + // sometimes, the decoding can get stuck in a repetition loop + // this is a simple strategy to avoid such cases - we simply flag the decoding as failed and advance + // the sliding window by 1 second + if (i == n_max - 1 && (result_len == 0 || seek_delta < 100*WHISPER_CHUNK_SIZE/2)) { + failed = true; + break; + } + } + + if (failed) { + fprintf(stderr, "\n%s: failed to generate timestamp token - using fallback strategy\n\n", __func__); + seek += 100; + continue; + } + + // shrink down to result_len + tokens_cur.resize(result_len); + + for (const auto & r : tokens_cur) { + prompt_past.push_back(r.id); + } + + // store the text from this iteration + if (tokens_cur.size() > 0) { + int i0 = 0; + auto t0 = seek + 2*(tokens_cur.front().tid - whisper_token_beg(ctx)); + + std::string text = ""; + + for (int i = 0; i < (int) tokens_cur.size(); i++) { + //printf("%s: %18s %6.3f %18s %6.3f\n", __func__, + // ctx->vocab.id_to_token[tokens_cur[i].id].c_str(), tokens_cur[i].p, + // ctx->vocab.id_to_token[tokens_cur[i].tid].c_str(), tokens_cur[i].pt); + + if (params.print_special == false && tokens_cur[i].id >= whisper_token_eot(ctx)) { + } else { + text += whisper_token_to_str(ctx, tokens_cur[i].id); + } + if (tokens_cur[i].id > whisper_token_beg(ctx) && !params.single_segment) { + const auto t1 = seek + 2*(tokens_cur[i].tid - whisper_token_beg(ctx)); + if (!text.empty()) { + const auto tt0 = params.speed_up ? 2*t0 : t0; + const auto tt1 = params.speed_up ? 
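// Units used by the loop above: "seek" counts 10 ms mel frames (offset_ms/10),
// and a timestamp token k steps past whisper_token_beg() stands for k*20 ms,
// hence the 2*(token.id - token_beg) conversions. A small sketch of how a
// sampled timestamp token advances the sliding window; the token ids are
// purely illustrative, the real values come from the vocabulary:

#include <cstdio>

int main() {
    const int token_beg = 50364;           // illustrative id of the first timestamp token
    const int sampled   = token_beg + 250; // hypothetical sampled timestamp token

    int seek = 0;                                   // current position, in 10 ms frames
    const int seek_delta = 2*(sampled - token_beg); // 250 steps -> 500 frames -> 5.00 s

    seek += seek_delta;

    printf("window advanced by %d frames = %.2f seconds\n", seek_delta, seek_delta/100.0);
    printf("new seek position  = %.2f seconds\n", seek/100.0);
    return 0;
}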
2*t1 : t1; + + if (params.print_realtime) { + if (params.print_timestamps) { + printf("[%s --> %s] %s\n", to_timestamp(tt0).c_str(), to_timestamp(tt1).c_str(), text.c_str()); + } else { + printf("%s", text.c_str()); + fflush(stdout); + } + } + + result_all.push_back({ tt0, tt1, text, {} }); + for (int j = i0; j <= i; j++) { + result_all.back().tokens.push_back(tokens_cur[j]); + } + + int n_new = 1; + + if (params.token_timestamps) { + whisper_exp_compute_token_level_timestamps( + ctx, result_all.size() - 1, params.thold_pt, params.thold_ptsum); + + if (params.max_len > 0) { + n_new = whisper_wrap_segment(ctx, params.max_len); + } + } + if (params.new_segment_callback) { + params.new_segment_callback(ctx, n_new, params.new_segment_callback_user_data); + } + } + text = ""; + while (i < (int) tokens_cur.size() && tokens_cur[i].id > whisper_token_beg(ctx)) { + i++; + } + i--; + t0 = t1; + i0 = i + 1; + } + } + + if (!text.empty()) { + const auto t1 = seek + seek_delta; + + const auto tt0 = params.speed_up ? 2*t0 : t0; + const auto tt1 = params.speed_up ? 2*t1 : t1; + + if (params.print_realtime) { + if (params.print_timestamps) { + printf("[%s --> %s] %s\n", to_timestamp(tt0).c_str(), to_timestamp(tt1).c_str(), text.c_str()); + } else { + printf("%s", text.c_str()); + fflush(stdout); + } + } + + result_all.push_back({ tt0, tt1, text, {} }); + for (int j = i0; j < (int) tokens_cur.size(); j++) { + result_all.back().tokens.push_back(tokens_cur[j]); + } + + int n_new = 1; + + if (params.token_timestamps) { + whisper_exp_compute_token_level_timestamps( + ctx, result_all.size() - 1, params.thold_pt, params.thold_ptsum); + + if (params.max_len > 0) { + n_new = whisper_wrap_segment(ctx, params.max_len); + } + } + if (params.new_segment_callback) { + params.new_segment_callback(ctx, n_new, params.new_segment_callback_user_data); + } + } + } + + seek += seek_delta; + } + + return 0; +} + +int whisper_full_parallel( + struct whisper_context * ctx, + struct whisper_full_params params, + const float * samples, + int n_samples, + int n_processors) { + if (n_processors == 1) { + return whisper_full(ctx, params, samples, n_samples); + } + + int ret = 0; + + // prepare separate contexts for each thread + std::vector ctxs(n_processors - 1); + + for (int i = 0; i < n_processors - 1; ++i) { + ctxs[i] = *ctx; + + auto & model = ctxs[i].model; + + // create the ggml memory context + { + struct ggml_init_params params; + params.mem_size = ctxs[i].buf_memory.size(); + params.mem_buffer = ctxs[i].buf_memory.data(); + + model.ctx_mem = ggml_init(params); + if (!model.ctx_mem) { + fprintf(stderr, "%s: ggml_init() failed\n", __func__); + return false; + } + } + + // separate key + value memory for each processor + { + auto & ctx = model.ctx_mem; + + const auto & hparams = model.hparams; + + const int n_text_state = hparams.n_text_state; + const int n_text_layer = hparams.n_text_layer; + const int n_text_ctx = hparams.n_text_ctx; + + // key/value memory for the self-attention layer + { + const int n_mem = n_text_layer*n_text_ctx; + const int n_elements = n_text_state*n_mem; + + model.memory_k = ggml_new_tensor_1d(ctx, GGML_TYPE_F16, n_elements); + model.memory_v = ggml_new_tensor_1d(ctx, GGML_TYPE_F16, n_elements); + } + + // key/value memory for the cross-attention layer + { + const int n_audio_ctx = hparams.n_audio_ctx; + + const int n_mem = n_text_layer*n_audio_ctx; + const int n_elements = n_text_state*n_mem; + + model.memory_cross_k = ggml_new_tensor_1d(ctx, GGML_TYPE_F16, n_elements); + model.memory_cross_v = 
ggml_new_tensor_1d(ctx, GGML_TYPE_F16, n_elements); + } + } + } + + const int offset_samples = (WHISPER_SAMPLE_RATE*params.offset_ms)/1000; + const int n_samples_per_processor = (n_samples - offset_samples)/n_processors; + + // the calling thread will process the first chunk + // while the other threads will process the remaining chunks + + std::vector workers(n_processors - 1); + for (int i = 0; i < n_processors - 1; ++i) { + const int start_samples = offset_samples + (i + 1)*n_samples_per_processor; + const int n_samples_cur = (i == n_processors - 2) ? n_samples - start_samples : n_samples_per_processor; + + auto params_cur = params; + + params_cur.offset_ms = 0; + params_cur.print_progress = false; + params_cur.print_realtime = false; + + params_cur.new_segment_callback = nullptr; + params_cur.new_segment_callback_user_data = nullptr; + + workers[i] = std::thread(whisper_full, &ctxs[i], std::move(params_cur), samples + start_samples, n_samples_cur); + } + + { + auto params_cur = params; + + ret = whisper_full(ctx, std::move(params_cur), samples, offset_samples + n_samples_per_processor); + } + + for (int i = 0; i < n_processors - 1; ++i) { + workers[i].join(); + } + + const int64_t offset_t = (int64_t) params.offset_ms/10.0; + + // combine results into ctx->result_all + for (int i = 0; i < n_processors - 1; ++i) { + auto & results_i = ctxs[i].result_all; + + for (int j = 0; j < (int) results_i.size(); ++j) { + // correct the segment timestamp taking into account the offset + results_i[j].t0 += 100*((i + 1)*n_samples_per_processor)/WHISPER_SAMPLE_RATE + offset_t; + results_i[j].t1 += 100*((i + 1)*n_samples_per_processor)/WHISPER_SAMPLE_RATE + offset_t; + + // make sure that segments are not overlapping + if (ctx->result_all.size() > 0) { + results_i[j].t0 = std::max(results_i[j].t0, ctx->result_all.back().t1); + } + + ctx->result_all.push_back(std::move(results_i[j])); + + // call the new_segment_callback for each segment + if (params.new_segment_callback) { + params.new_segment_callback(ctx, 1, params.new_segment_callback_user_data); + } + } + + ctx->t_mel_us += ctxs[i].t_mel_us; + ctx->t_sample_us += ctxs[i].t_sample_us; + ctx->t_encode_us += ctxs[i].t_encode_us; + ctx->t_decode_us += ctxs[i].t_decode_us; + } + + // average the timings + ctx->t_mel_us /= n_processors; + ctx->t_sample_us /= n_processors; + ctx->t_encode_us /= n_processors; + ctx->t_decode_us /= n_processors; + + // print information about the audio boundaries + fprintf(stderr, "\n"); + fprintf(stderr, "%s: the audio has been split into %d chunks at the following times:\n", __func__, n_processors); + for (int i = 0; i < n_processors - 1; ++i) { + fprintf(stderr, "%s: split %d - %s\n", __func__, (i + 1), to_timestamp(100*((i + 1)*n_samples_per_processor)/WHISPER_SAMPLE_RATE + offset_t).c_str()); + } + fprintf(stderr, "%s: the transcription quality may be degraded near these boundaries\n", __func__); + + return ret; +} + +int whisper_full_n_segments(struct whisper_context * ctx) { + return ctx->result_all.size(); +} + +int64_t whisper_full_get_segment_t0(struct whisper_context * ctx, int i_segment) { + return ctx->result_all[i_segment].t0; +} + +int64_t whisper_full_get_segment_t1(struct whisper_context * ctx, int i_segment) { + return ctx->result_all[i_segment].t1; +} + +const char * whisper_full_get_segment_text(struct whisper_context * ctx, int i_segment) { + return ctx->result_all[i_segment].text.c_str(); +} + +int whisper_full_n_tokens(struct whisper_context * ctx, int i_segment) { + return 
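// The parallel path slices the PCM buffer evenly and later shifts each
// worker's timestamps by the start of its slice, converted from samples to
// 10 ms frames (100*samples/WHISPER_SAMPLE_RATE). A small sketch of that
// bookkeeping for an assumed 60-second input, 4 processors, and offset_ms = 0:

#include <cstdio>

int main() {
    const int sample_rate  = 16000;          // WHISPER_SAMPLE_RATE
    const int n_samples    = 60*sample_rate; // assumed 60 s of audio
    const int n_processors = 4;

    const int n_per_proc = n_samples/n_processors;

    for (int i = 0; i < n_processors - 1; ++i) {
        const int start_samples = (i + 1)*n_per_proc;
        // offset added to every t0/t1 produced by worker i, in 10 ms frames
        const long long offset_t = 100LL*start_samples/sample_rate;
        printf("worker %d starts at sample %8d -> timestamp offset %lld (%.2f s)\n",
               i, start_samples, offset_t, offset_t/100.0);
    }
    return 0;
}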
ctx->result_all[i_segment].tokens.size(); +} + +const char * whisper_full_get_token_text(struct whisper_context * ctx, int i_segment, int i_token) { + return ctx->vocab.id_to_token[ctx->result_all[i_segment].tokens[i_token].id].c_str(); +} + +whisper_token whisper_full_get_token_id(struct whisper_context * ctx, int i_segment, int i_token) { + return ctx->result_all[i_segment].tokens[i_token].id; +} + +struct whisper_token_data whisper_full_get_token_data(struct whisper_context * ctx, int i_segment, int i_token) { + return ctx->result_all[i_segment].tokens[i_token]; +} + +float whisper_full_get_token_p(struct whisper_context * ctx, int i_segment, int i_token) { + return ctx->result_all[i_segment].tokens[i_token].p; +} + +// ================================================================================================= + +// +// Experimental stuff below +// +// Not sure if these should be part of the library at all, because the quality of the results is not +// guaranteed. Might get removed at some point unless a robust algorithm implementation is found +// + +// ================================================================================================= + +// +// token-level timestamps +// + +static int timestamp_to_sample(int64_t t, int n_samples) { + return std::max(0, std::min((int) n_samples - 1, (int) ((t*WHISPER_SAMPLE_RATE)/100))); +} + +static int64_t sample_to_timestamp(int i_sample) { + return (100*i_sample)/WHISPER_SAMPLE_RATE; +} + +// a cost-function / heuristic that is high for text that takes longer to pronounce +// obviously, can be improved +static float voice_length(const std::string & text) { + float res = 0.0f; + + for (size_t i = 0; i < text.size(); ++i) { + if (text[i] == ' ') { + res += 0.01f; + } else if (text[i] == ',') { + res += 2.00f; + } else if (text[i] == '.') { + res += 3.00f; + } else if (text[i] == '!') { + res += 3.00f; + } else if (text[i] == '?') { + res += 3.00f; + } else if (text[i] >= '0' && text[i] <= '9') { + res += 3.00f; + } else { + res += 1.00f; + } + } + + return res; +} + +// average the fabs of the signal +static std::vector get_signal_energy(const float * signal, int n_samples, int n_samples_per_half_window) { + const int hw = n_samples_per_half_window; + + std::vector result(n_samples); + + for (int i = 0; i < n_samples; i++) { + float sum = 0; + for (int j = -hw; j <= hw; j++) { + if (i + j >= 0 && i + j < n_samples) { + sum += fabs(signal[i + j]); + } + } + result[i] = sum/(2*hw + 1); + } + + return result; +} + +static void whisper_exp_compute_token_level_timestamps( + struct whisper_context * ctx, + int i_segment, + float thold_pt, + float thold_ptsum) { + auto & segment = ctx->result_all[i_segment]; + auto & tokens = segment.tokens; + + const int n_samples = ctx->energy.size(); + + if (n_samples == 0) { + fprintf(stderr, "%s: no signal data available\n", __func__); + return; + } + + const int64_t t0 = segment.t0; + const int64_t t1 = segment.t1; + + const int n = tokens.size(); + + if (n == 0) { + return; + } + + if (n == 1) { + tokens[0].t0 = t0; + tokens[0].t1 = t1; + + return; + } + + auto & t_beg = ctx->t_beg; + auto & t_last = ctx->t_last; + auto & tid_last = ctx->tid_last; + + for (int j = 0; j < n; ++j) { + auto & token = tokens[j]; + + if (j == 0) { + if (token.id == whisper_token_beg(ctx)) { + tokens[j ].t0 = t0; + tokens[j ].t1 = t0; + tokens[j + 1].t0 = t0; + + t_beg = t0; + t_last = t0; + tid_last = whisper_token_beg(ctx); + } else { + tokens[j ].t0 = t_last; + } + } + + const int64_t tt = t_beg + 2*(token.tid - 
whisper_token_beg(ctx)); + + tokens[j].id = token.id; + tokens[j].tid = token.tid; + tokens[j].p = token.p; + tokens[j].pt = token.pt; + tokens[j].ptsum = token.ptsum; + + tokens[j].vlen = voice_length(whisper_token_to_str(ctx, token.id)); + + if (token.pt > thold_pt && token.ptsum > thold_ptsum && token.tid > tid_last && tt <= t1) { + if (j > 0) { + tokens[j - 1].t1 = tt; + } + tokens[j].t0 = tt; + tid_last = token.tid; + } + } + + tokens[n - 2].t1 = t1; + tokens[n - 1].t0 = t1; + tokens[n - 1].t1 = t1; + + t_last = t1; + + // find intervals of tokens with unknown timestamps + // fill the timestamps by proportionally splitting the interval based on the token voice lengths + { + int p0 = 0; + int p1 = 0; + + while (true) { + while (p1 < n && tokens[p1].t1 < 0) { + p1++; + } + + if (p1 >= n) { + p1--; + } + + if (p1 > p0) { + double psum = 0.0; + for (int j = p0; j <= p1; j++) { + psum += tokens[j].vlen; + } + + //printf("analyzing %d - %d, psum = %f\n", p0, p1, psum); + + const double dt = tokens[p1].t1 - tokens[p0].t0; + + // split the time proportionally to the voice length + for (int j = p0 + 1; j <= p1; j++) { + const double ct = tokens[j - 1].t0 + dt*tokens[j - 1].vlen/psum; + + tokens[j - 1].t1 = ct; + tokens[j ].t0 = ct; + } + } + + p1++; + p0 = p1; + if (p1 >= n) { + break; + } + } + } + + // fix up (just in case) + for (int j = 0; j < n - 1; j++) { + if (tokens[j].t1 < 0) { + tokens[j + 1].t0 = tokens[j].t1; + } + + if (j > 0) { + if (tokens[j - 1].t1 > tokens[j].t0) { + tokens[j].t0 = tokens[j - 1].t1; + tokens[j].t1 = std::max(tokens[j].t0, tokens[j].t1); + } + } + } + + // VAD + // expand or contract tokens based on voice activity + { + const int hw = WHISPER_SAMPLE_RATE/8; + + for (int j = 0; j < n; j++) { + if (tokens[j].id >= whisper_token_eot(ctx)) { + continue; + } + + int s0 = timestamp_to_sample(tokens[j].t0, n_samples); + int s1 = timestamp_to_sample(tokens[j].t1, n_samples); + + const int ss0 = std::max(s0 - hw, 0); + const int ss1 = std::min(s1 + hw, n_samples); + + const int ns = ss1 - ss0; + + float sum = 0.0f; + + for (int k = ss0; k < ss1; k++) { + sum += ctx->energy[k]; + } + + const float thold = 0.5*sum/ns; + + { + int k = s0; + if (ctx->energy[k] > thold && j > 0) { + while (k > 0 && ctx->energy[k] > thold) { + k--; + } + tokens[j].t0 = sample_to_timestamp(k); + if (tokens[j].t0 < tokens[j - 1].t1) { + tokens[j].t0 = tokens[j - 1].t1; + } else { + s0 = k; + } + } else { + while (ctx->energy[k] < thold && k < s1) { + k++; + } + s0 = k; + tokens[j].t0 = sample_to_timestamp(k); + } + } + + { + int k = s1; + if (ctx->energy[k] > thold) { + while (k < n_samples - 1 && ctx->energy[k] > thold) { + k++; + } + tokens[j].t1 = sample_to_timestamp(k); + if (j < ns - 1 && tokens[j].t1 > tokens[j + 1].t0) { + tokens[j].t1 = tokens[j + 1].t0; + } else { + s1 = k; + } + } else { + while (ctx->energy[k] < thold && k > s0) { + k--; + } + s1 = k; + tokens[j].t1 = sample_to_timestamp(k); + } + } + } + } + + // fixed token expand (optional) + //{ + // const int t_expand = 0; + + // for (int j = 0; j < n; j++) { + // if (j > 0) { + // tokens[j].t0 = std::max(0, (int) (tokens[j].t0 - t_expand)); + // } + // if (j < n - 1) { + // tokens[j].t1 = tokens[j].t1 + t_expand; + // } + // } + //} + + // debug info + //for (int j = 0; j < n; ++j) { + // const auto & token = tokens[j]; + // const auto tt = token.pt > thold_pt && token.ptsum > 0.01 ? 
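// Filling in unknown token times: each interval is divided among its tokens in
// proportion to the "voice length" heuristic, so text that takes longer to say
// gets a longer slice. A condensed sketch of that proportional split over one
// interval [t0, t1], with hypothetical voice lengths:

#include <cstdio>
#include <vector>

int main() {
    const std::vector<float> vlen = { 4.0f, 1.0f, 6.0f, 3.0f }; // hypothetical per-token voice lengths

    const double t0 = 100.0; // interval start, in 10 ms frames
    const double t1 = 240.0; // interval end

    double psum = 0.0;
    for (float v : vlen) psum += v;

    const double dt = t1 - t0;

    double cur = t0;
    for (size_t j = 0; j < vlen.size(); ++j) {
        const double next = cur + dt*vlen[j]/psum; // this token's share of the interval
        printf("token %zu: t0 = %6.1f  t1 = %6.1f\n", j, cur, next);
        cur = next;
    }
    return 0;
}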
+    // debug info
+    //for (int j = 0; j < n; ++j) {
+    //    const auto & token = tokens[j];
+    //    const auto tt = token.pt > thold_pt && token.ptsum > 0.01 ? whisper_token_to_str(ctx, token.tid) : "[?]";
+    //    printf("%s: %10s %6.3f %6.3f %6.3f %6.3f %5d %5d '%s'\n", __func__,
+    //            tt, token.p, token.pt, token.ptsum, token.vlen, (int) token.t0, (int) token.t1, whisper_token_to_str(ctx, token.id));
+
+    //    if (tokens[j].id >= whisper_token_eot(ctx)) {
+    //        continue;
+    //    }
+    //}
+}
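The experimental timestamps computed above are exposed through the `whisper_full_get_token_*()` accessors declared in the header that follows. As a rough illustration only (this sketch is not part of the patch), a caller might enable them like this; the model path and the already-decoded 16 kHz mono float PCM buffer are placeholders:

```c
// Hypothetical standalone example - not part of the patch.
#include <stdio.h>
#include "whisper.h"

// Prints per-token timestamps for a buffer of 16 kHz mono float PCM.
// The model path is an assumption; audio loading is left to the caller.
int print_token_timestamps(const float * pcmf32, int n_samples) {
    struct whisper_context * ctx = whisper_init("models/ggml-base.en.bin");
    if (ctx == NULL) {
        return 1;
    }

    struct whisper_full_params params = whisper_full_default_params(WHISPER_SAMPLING_GREEDY);
    params.token_timestamps = true;  // enable the experimental token-level timestamps
    params.thold_pt         = 0.01f; // timestamp token probability threshold

    if (whisper_full(ctx, params, pcmf32, n_samples) != 0) {
        whisper_free(ctx);
        return 1;
    }

    const int n_segments = whisper_full_n_segments(ctx);
    for (int i = 0; i < n_segments; ++i) {
        const int n_tokens = whisper_full_n_tokens(ctx, i);
        for (int j = 0; j < n_tokens; ++j) {
            const whisper_token_data data = whisper_full_get_token_data(ctx, i, j);
            // t0/t1 are in units of 10 ms, matching timestamp_to_sample() above
            printf("[%6d -> %6d] %s\n", (int) data.t0, (int) data.t1,
                   whisper_full_get_token_text(ctx, i, j));
        }
    }

    whisper_free(ctx);
    return 0;
}
```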
diff --git a/examples/whisper.android/app/src/main/jni/whisper/libwhisper/whisper.h b/examples/whisper.android/app/src/main/jni/whisper/libwhisper/whisper.h
new file mode 100644
index 0000000..def77d4
--- /dev/null
+++ b/examples/whisper.android/app/src/main/jni/whisper/libwhisper/whisper.h
@@ -0,0 +1,296 @@
+#ifndef WHISPER_H
+#define WHISPER_H
+
+#include <stdint.h>
+#include <stdbool.h>
+
+#ifdef WHISPER_SHARED
+#    ifdef _WIN32
+#        ifdef WHISPER_BUILD
+#            define WHISPER_API __declspec(dllexport)
+#        else
+#            define WHISPER_API __declspec(dllimport)
+#        endif
+#    else
+#        define WHISPER_API __attribute__ ((visibility ("default")))
+#    endif
+#else
+#    define WHISPER_API
+#endif
+
+#define WHISPER_SAMPLE_RATE 16000
+#define WHISPER_N_FFT       400
+#define WHISPER_N_MEL       80
+#define WHISPER_HOP_LENGTH  160
+#define WHISPER_CHUNK_SIZE  30
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+    //
+    // C interface
+    //
+    // The following interface is thread-safe as long as the same whisper_context is not used by multiple threads
+    // concurrently.
+    //
+    // Basic usage:
+    //
+    //     #include "whisper.h"
+    //
+    //     ...
+    //
+    //     struct whisper_context * ctx = whisper_init("/path/to/ggml-base.en.bin");
+    //
+    //     if (whisper_full(ctx, wparams, pcmf32.data(), pcmf32.size()) != 0) {
+    //         fprintf(stderr, "failed to process audio\n");
+    //         return 7;
+    //     }
+    //
+    //     const int n_segments = whisper_full_n_segments(ctx);
+    //     for (int i = 0; i < n_segments; ++i) {
+    //         const char * text = whisper_full_get_segment_text(ctx, i);
+    //         printf("%s", text);
+    //     }
+    //
+    //     whisper_free(ctx);
+    //
+    //     ...
+    //
+    // This is a demonstration of the most straightforward usage of the library.
+    // "pcmf32" contains the RAW audio data in 32-bit floating point format.
+    //
+    // The interface also allows for more fine-grained control over the computation, but it requires a deeper
+    // understanding of how the model works.
+    //
+
+    struct whisper_context;
+
+    typedef int whisper_token;
+
+    typedef struct whisper_token_data {
+        whisper_token id;  // token id
+        whisper_token tid; // forced timestamp token id
+
+        float p;           // probability of the token
+        float pt;          // probability of the timestamp token
+        float ptsum;       // sum of probabilities of all timestamp tokens
+
+        // token-level timestamp data
+        // do not use if you haven't computed token-level timestamps
+        int64_t t0;        // start time of the token
+        int64_t t1;        // end time of the token
+
+        float vlen;        // voice length of the token
+    } whisper_token_data;
+
+    // Allocates all memory needed for the model and loads the model from the given file.
+    // Returns NULL on failure.
+    WHISPER_API struct whisper_context * whisper_init(const char * path_model);
+
+    // Frees all memory allocated by the model.
+    WHISPER_API void whisper_free(struct whisper_context * ctx);
+
+    // Convert RAW PCM audio to log mel spectrogram.
+    // The resulting spectrogram is stored inside the provided whisper context.
+    // Returns 0 on success
+    WHISPER_API int whisper_pcm_to_mel(
+            struct whisper_context * ctx,
+            const float * samples,
+            int n_samples,
+            int n_threads);
+
+    // This can be used to set a custom log mel spectrogram inside the provided whisper context.
+    // Use this instead of whisper_pcm_to_mel() if you want to provide your own log mel spectrogram.
+    // n_mel must be 80
+    // Returns 0 on success
+    WHISPER_API int whisper_set_mel(
+            struct whisper_context * ctx,
+            const float * data,
+            int n_len,
+            int n_mel);
+
+    // Run the Whisper encoder on the log mel spectrogram stored inside the provided whisper context.
+    // Make sure to call whisper_pcm_to_mel() or whisper_set_mel() first.
+    // offset can be used to specify the offset of the first frame in the spectrogram.
+    // Returns 0 on success
+    WHISPER_API int whisper_encode(
+            struct whisper_context * ctx,
+            int offset,
+            int n_threads);
+
+    // Run the Whisper decoder to obtain the logits and probabilities for the next token.
+    // Make sure to call whisper_encode() first.
+    // tokens + n_tokens is the provided context for the decoder.
+    // n_past is the number of tokens to use from previous decoder calls.
+    // Returns 0 on success
+    WHISPER_API int whisper_decode(
+            struct whisper_context * ctx,
+            const whisper_token * tokens,
+            int n_tokens,
+            int n_past,
+            int n_threads);
+
+    // Token sampling methods.
+    // These are provided for convenience and can be used after each call to whisper_decode().
+    // You can also implement your own sampling method using the whisper_get_probs() function.
+    // whisper_sample_best() returns the token with the highest probability
+    // whisper_sample_timestamp() returns the most probable timestamp token
+    WHISPER_API whisper_token_data whisper_sample_best(struct whisper_context * ctx);
+    WHISPER_API whisper_token_data whisper_sample_timestamp(struct whisper_context * ctx, bool is_initial);
+
+    // Return the id of the specified language, returns -1 if not found
+    WHISPER_API int whisper_lang_id(const char * lang);
+
+    WHISPER_API int whisper_n_len          (struct whisper_context * ctx); // mel length
+    WHISPER_API int whisper_n_vocab        (struct whisper_context * ctx);
+    WHISPER_API int whisper_n_text_ctx     (struct whisper_context * ctx);
+    WHISPER_API int whisper_is_multilingual(struct whisper_context * ctx);
+
+    // The probabilities for the next token
+    WHISPER_API float * whisper_get_probs(struct whisper_context * ctx);
+
+    // Token Id -> String. Uses the vocabulary in the provided context
+    WHISPER_API const char * whisper_token_to_str(struct whisper_context * ctx, whisper_token token);
+
+    // Special tokens
+    WHISPER_API whisper_token whisper_token_eot (struct whisper_context * ctx);
+    WHISPER_API whisper_token whisper_token_sot (struct whisper_context * ctx);
+    WHISPER_API whisper_token whisper_token_prev(struct whisper_context * ctx);
+    WHISPER_API whisper_token whisper_token_solm(struct whisper_context * ctx);
+    WHISPER_API whisper_token whisper_token_not (struct whisper_context * ctx);
+    WHISPER_API whisper_token whisper_token_beg (struct whisper_context * ctx);
+
+    // Task tokens
+    WHISPER_API whisper_token whisper_token_translate (void);
+    WHISPER_API whisper_token whisper_token_transcribe(void);
+
+    // Performance information
+    WHISPER_API void whisper_print_timings(struct whisper_context * ctx);
+    WHISPER_API void whisper_reset_timings(struct whisper_context * ctx);
+
+    // Print system information
+    WHISPER_API const char * whisper_print_system_info(void);
+
+    ////////////////////////////////////////////////////////////////////////////
+
+    // Available sampling strategies
+    enum whisper_sampling_strategy {
+        WHISPER_SAMPLING_GREEDY,      // Always select the most probable token
+        WHISPER_SAMPLING_BEAM_SEARCH, // TODO: not implemented yet!
+    };
+
+    // Text segment callback
+    // Called on every newly generated text segment
+    // Use the whisper_full_...() functions to obtain the text segments
+    typedef void (*whisper_new_segment_callback)(struct whisper_context * ctx, int n_new, void * user_data);
+
+    // Encoder begin callback
+    // If not NULL, called before the encoder starts
+    // If it returns false, the computation is aborted
+    typedef bool (*whisper_encoder_begin_callback)(struct whisper_context * ctx, void * user_data);
+
+    // Parameters for the whisper_full() function
+    // If you change the order or add new parameters, make sure to update the default values in whisper.cpp:
+    // whisper_full_default_params()
+    struct whisper_full_params {
+        enum whisper_sampling_strategy strategy;
+
+        int n_threads;
+        int n_max_text_ctx;
+        int offset_ms;          // start offset in ms
+        int duration_ms;        // audio duration to process in ms
+
+        bool translate;
+        bool no_context;
+        bool single_segment;    // force single segment output (useful for streaming)
+        bool print_special;
+        bool print_progress;
+        bool print_realtime;
+        bool print_timestamps;
+
+        // [EXPERIMENTAL] token-level timestamps
+        bool  token_timestamps; // enable token-level timestamps
+        float thold_pt;         // timestamp token probability threshold (~0.01)
+        float thold_ptsum;      // timestamp token sum probability threshold (~0.01)
+        int   max_len;          // max segment length in characters
+        int   max_tokens;       // max tokens per segment (0 = no limit)
+
+        // [EXPERIMENTAL] speed-up techniques
+        bool speed_up;          // speed-up the audio by 2x using Phase Vocoder
+        int  audio_ctx;         // overwrite the audio context size (0 = use default)
+
+        // tokens to provide the whisper model as initial prompt
+        // these are prepended to any existing text context from a previous call
+        const whisper_token * prompt_tokens;
+        int prompt_n_tokens;
+
+        const char * language;
+
+        struct {
+            int n_past;
+        } greedy;
+
+        struct {
+            int n_past;
+            int beam_width;
+            int n_best;
+        } beam_search;
+
+        whisper_new_segment_callback new_segment_callback;
+        void * new_segment_callback_user_data;
+
+        whisper_encoder_begin_callback encoder_begin_callback;
+        void * encoder_begin_callback_user_data;
+    };
+
+    WHISPER_API struct whisper_full_params whisper_full_default_params(enum whisper_sampling_strategy strategy);
+
+    // Run the entire model: PCM -> log mel spectrogram -> encoder -> decoder -> text
+    // Uses the specified decoding strategy to obtain the text.
+    WHISPER_API int whisper_full(
+            struct whisper_context * ctx,
+            struct whisper_full_params params,
+            const float * samples,
+            int n_samples);
+
+    // Split the input audio into chunks and process each chunk separately using whisper_full()
+    // It seems this approach can offer some speedup in some cases.
+    // However, the transcription accuracy can be worse at the beginning and end of each chunk.
+    WHISPER_API int whisper_full_parallel(
+            struct whisper_context * ctx,
+            struct whisper_full_params params,
+            const float * samples,
+            int n_samples,
+            int n_processors);
+
+    // Number of generated text segments.
+    // A segment can be a few words, a sentence, or even a paragraph.
+    WHISPER_API int whisper_full_n_segments(struct whisper_context * ctx);
+
+    // Get the start and end time of the specified segment.
+    WHISPER_API int64_t whisper_full_get_segment_t0(struct whisper_context * ctx, int i_segment);
+    WHISPER_API int64_t whisper_full_get_segment_t1(struct whisper_context * ctx, int i_segment);
+
+    // Get the text of the specified segment.
+    WHISPER_API const char * whisper_full_get_segment_text(struct whisper_context * ctx, int i_segment);
+
+    // Get number of tokens in the specified segment.
+    WHISPER_API int whisper_full_n_tokens(struct whisper_context * ctx, int i_segment);
+
+    // Get the token text of the specified token in the specified segment.
+    WHISPER_API const char * whisper_full_get_token_text(struct whisper_context * ctx, int i_segment, int i_token);
+    WHISPER_API whisper_token whisper_full_get_token_id (struct whisper_context * ctx, int i_segment, int i_token);
+
+    // Get token data for the specified token in the specified segment.
+    // This contains probabilities, timestamps, etc.
+    WHISPER_API whisper_token_data whisper_full_get_token_data(struct whisper_context * ctx, int i_segment, int i_token);
+
+    // Get the probability of the specified token in the specified segment.
+    WHISPER_API float whisper_full_get_token_p(struct whisper_context * ctx, int i_segment, int i_token);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/examples/whisper.android/app/src/main/res/drawable/ic_launcher_background.xml b/examples/whisper.android/app/src/main/res/drawable/ic_launcher_background.xml
new file mode 100644
index 0000000..07d5da9
--- /dev/null
+++ b/examples/whisper.android/app/src/main/res/drawable/ic_launcher_background.xml
@@ -0,0 +1,170 @@
diff --git a/examples/whisper.android/app/src/main/res/drawable/ic_launcher_foreground.xml b/examples/whisper.android/app/src/main/res/drawable/ic_launcher_foreground.xml
new file mode 100644
index 0000000..2b068d1
--- /dev/null
+++ b/examples/whisper.android/app/src/main/res/drawable/ic_launcher_foreground.xml
@@ -0,0 +1,30 @@
diff --git a/examples/whisper.android/app/src/main/res/mipmap-anydpi/ic_launcher.xml b/examples/whisper.android/app/src/main/res/mipmap-anydpi/ic_launcher.xml
new file mode 100644
index 0000000..eca70cf
--- /dev/null
+++ b/examples/whisper.android/app/src/main/res/mipmap-anydpi/ic_launcher.xml
@@ -0,0 +1,5 @@
+<?xml version="1.0" encoding="utf-8"?>
+<adaptive-icon xmlns:android="http://schemas.android.com/apk/res/android">
+    <background android:drawable="@drawable/ic_launcher_background" />
+    <foreground android:drawable="@drawable/ic_launcher_foreground" />
+</adaptive-icon>
\ No newline at end of file
diff --git a/examples/whisper.android/app/src/main/res/values/colors.xml b/examples/whisper.android/app/src/main/res/values/colors.xml
new file mode 100644
index 0000000..f8c6127
--- /dev/null
+++ b/examples/whisper.android/app/src/main/res/values/colors.xml
@@ -0,0 +1,10 @@
+<?xml version="1.0" encoding="utf-8"?>
+<resources>
+    <color name="purple_200">#FFBB86FC</color>
+    <color name="purple_500">#FF6200EE</color>
+    <color name="purple_700">#FF3700B3</color>
+    <color name="teal_200">#FF03DAC5</color>
+    <color name="teal_700">#FF018786</color>
+    <color name="black">#FF000000</color>
+    <color name="white">#FFFFFFFF</color>
+</resources>
\ No newline at end of file
diff --git a/examples/whisper.android/app/src/main/res/values/strings.xml b/examples/whisper.android/app/src/main/res/values/strings.xml
new file mode 100644
index 0000000..40804ec
--- /dev/null
+++ b/examples/whisper.android/app/src/main/res/values/strings.xml
@@ -0,0 +1,3 @@
+<resources>
+    <string name="app_name">WhisperCppDemo</string>
+</resources>
\ No newline at end of file
diff --git a/examples/whisper.android/app/src/main/res/values/themes.xml b/examples/whisper.android/app/src/main/res/values/themes.xml
new file mode 100644
index 0000000..c16729f
--- /dev/null
+++ b/examples/whisper.android/app/src/main/res/values/themes.xml
@@ -0,0 +1,5 @@
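For reference (again, not part of the patch), the `new_segment_callback` declared in whisper.h above is how a front end such as this demo could surface text incrementally while `whisper_full()` is still running. The sketch below only prints each newly generated segment; the helper names are illustrative:

```c
// Hypothetical example - not part of the patch.
#include <stdio.h>
#include "whisper.h"

// Called by whisper_full() every time n_new segments have been generated.
static void on_new_segment(struct whisper_context * ctx, int n_new, void * user_data) {
    (void) user_data;

    const int n_segments = whisper_full_n_segments(ctx);

    // the newly generated segments are the last n_new ones
    for (int i = n_segments - n_new; i < n_segments; ++i) {
        printf("[%6d -> %6d] %s\n",
               (int) whisper_full_get_segment_t0(ctx, i),
               (int) whisper_full_get_segment_t1(ctx, i),
               whisper_full_get_segment_text(ctx, i));
    }
}

// Attach the callback to the params before calling whisper_full().
static void configure_streaming_output(struct whisper_full_params * params) {
    params->new_segment_callback           = on_new_segment;
    params->new_segment_callback_user_data = NULL;
    params->print_realtime                 = false; // output comes from the callback instead
}
```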