diff --git a/swift/StableDiffusion/pipeline/AlphasCumprodCalculation.swift b/swift/StableDiffusion/pipeline/AlphasCumprodCalculation.swift
new file mode 100644
index 0000000..350ba3a
--- /dev/null
+++ b/swift/StableDiffusion/pipeline/AlphasCumprodCalculation.swift
@@ -0,0 +1,29 @@
+// For licensing see accompanying LICENSE.md file.
+// Copyright (C) 2022 Apple Inc. All Rights Reserved.
+
+import Foundation
+
+public struct AlphasCumprodCalculation {
+    public var sqrtAlphasCumprod: Float
+    public var sqrtOneMinusAlphasCumprod: Float
+
+    public init(
+        sqrtAlphasCumprod: Float,
+        sqrtOneMinusAlphasCumprod: Float
+    ) {
+        self.sqrtAlphasCumprod = sqrtAlphasCumprod
+        self.sqrtOneMinusAlphasCumprod = sqrtOneMinusAlphasCumprod
+    }
+
+    public init(
+        alphasCumprod: [Float],
+        timesteps: Int = 1_000,
+        steps: Int,
+        strength: Float
+    ) {
+        let tEnc = Int(strength * Float(steps))
+        let initTimestep = min(max(0, timesteps - timesteps / steps * (steps - tEnc) + 1), timesteps - 1) // clamp to a valid index (strength == 1 would otherwise overflow)
+        self.sqrtAlphasCumprod = alphasCumprod[initTimestep].squareRoot()
+        self.sqrtOneMinusAlphasCumprod = (1 - alphasCumprod[initTimestep]).squareRoot()
+    }
+}
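
Note (reviewer sketch, not part of the patch): the second initializer maps the image2image `strength` to a noise timestep. A minimal check of that index math, assuming the module built from this patch is imported and using a made-up, monotonically decreasing table in place of the scheduler's real `alphasCumprod`:

    // Sketch only: exercises the strength-to-timestep mapping above.
    let dummyAlphas = (0..<1_000).map { 1.0 - Float($0) / 1_000.0 }  // stand-in for the scheduler's table
    let calc = AlphasCumprodCalculation(alphasCumprod: dummyAlphas, steps: 50, strength: 0.75)
    // tEnc = Int(0.75 * 50) = 37
    // initTimestep = min(max(0, 1_000 - 20 * (50 - 37) + 1), 999) = 741
    // so calc.sqrtAlphasCumprod == dummyAlphas[741].squareRoot()

Higher strength pushes `initTimestep` toward the noisiest end of the schedule, i.e. the source image contributes less to the result.
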
diff --git a/swift/StableDiffusion/pipeline/CGImage+vImage.swift b/swift/StableDiffusion/pipeline/CGImage+vImage.swift
new file mode 100644
index 0000000..809836e
--- /dev/null
+++ b/swift/StableDiffusion/pipeline/CGImage+vImage.swift
@@ -0,0 +1,120 @@
+// For licensing see accompanying LICENSE.md file.
+// Copyright (C) 2022 Apple Inc. All Rights Reserved.
+
+import Foundation
+import Accelerate
+import CoreML
+
+@available(iOS 16.0, macOS 13.0, *)
+extension CGImage {
+
+    typealias PixelBufferPFx1 = vImage.PixelBuffer<vImage.PlanarF>
+    typealias PixelBufferP8x3 = vImage.PixelBuffer<vImage.Planar8x3>
+    typealias PixelBufferIFx3 = vImage.PixelBuffer<vImage.InterleavedFx3>
+    typealias PixelBufferI8x3 = vImage.PixelBuffer<vImage.Interleaved8x3>
+
+    public enum ShapedArrayError: String, Swift.Error {
+        case wrongNumberOfChannels
+        case incorrectFormatsConvertingToShapedArray
+        case vImageConverterNotInitialized
+    }
+
+    public static func fromShapedArray(_ array: MLShapedArray<Float32>) throws -> CGImage {
+
+        // array is [N,C,H,W], where C==3
+        let channelCount = array.shape[1]
+        guard channelCount == 3 else {
+            throw ShapedArrayError.wrongNumberOfChannels
+        }
+
+        let height = array.shape[2]
+        let width = array.shape[3]
+
+        // Normalize each channel into a float between 0 and 1.0
+        let floatChannels = (0..<channelCount).map { i in
+
+            // Normalized channel output
+            let cOut = PixelBufferPFx1(width: width, height: height)
+
+            // Reference this channel in the array and normalize
+            array[0][i].withUnsafeShapedBufferPointer { ptr, _, strides in
+                let cIn = PixelBufferPFx1(data: .init(mutating: ptr.baseAddress!),
+                                          width: width, height: height,
+                                          byteCountPerRow: strides[0]*4)
+                // Map [-1.0 1.0] -> [0.0 1.0]
+                cIn.multiply(by: 0.5, preBias: 1.0, postBias: 0.0, destination: cOut)
+            }
+            return cOut
+        }
+
+        // Convert to interleaved and then to UInt8
+        let floatImage = PixelBufferIFx3(planarBuffers: floatChannels)
+        let uint8Image = PixelBufferI8x3(width: width, height: height)
+        floatImage.convert(to: uint8Image) // maps [0.0 1.0] -> [0 255] and clips
+
+        // Convert uint8x3 to RGB CGImage (no alpha)
+        let bitmapInfo = CGBitmapInfo(rawValue: CGImageAlphaInfo.none.rawValue)
+        let cgImage = uint8Image.makeCGImage(cgImageFormat:
+            .init(bitsPerComponent: 8,
+                  bitsPerPixel: 3*8,
+                  colorSpace: CGColorSpaceCreateDeviceRGB(),
+                  bitmapInfo: bitmapInfo)!)!
+
+        return cgImage
+    }
+
+    public var plannerRGBShapedArray: MLShapedArray<Float32> {
+        get throws {
+            guard
+                var sourceFormat = vImage_CGImageFormat(cgImage: self),
+                var mediumFormat = vImage_CGImageFormat(
+                    bitsPerComponent: 8 * MemoryLayout<UInt8>.size,
+                    bitsPerPixel: 8 * MemoryLayout<UInt8>.size * 4,
+                    colorSpace: CGColorSpaceCreateDeviceRGB(),
+                    bitmapInfo: CGBitmapInfo(rawValue: CGImageAlphaInfo.first.rawValue)),
+                let width = vImagePixelCount(exactly: self.width),
+                let height = vImagePixelCount(exactly: self.height)
+            else {
+                throw ShapedArrayError.incorrectFormatsConvertingToShapedArray
+            }
+
+            var sourceImageBuffer = try vImage_Buffer(cgImage: self)
+
+            var mediumDestination = try vImage_Buffer(width: Int(width), height: Int(height), bitsPerPixel: mediumFormat.bitsPerPixel)
+
+            let converter = vImageConverter_CreateWithCGImageFormat(
+                &sourceFormat,
+                &mediumFormat,
+                nil,
+                vImage_Flags(kvImagePrintDiagnosticsToConsole),
+                nil)
+
+            guard let converter = converter?.takeRetainedValue() else {
+                throw ShapedArrayError.vImageConverterNotInitialized
+            }
+
+            vImageConvert_AnyToAny(converter, &sourceImageBuffer, &mediumDestination, nil, vImage_Flags(kvImagePrintDiagnosticsToConsole))
+
+            var destinationA = try vImage_Buffer(width: Int(width), height: Int(height), bitsPerPixel: 8 * UInt32(MemoryLayout<Float>.size))
+            var destinationR = try vImage_Buffer(width: Int(width), height: Int(height), bitsPerPixel: 8 * UInt32(MemoryLayout<Float>.size))
+            var destinationG = try vImage_Buffer(width: Int(width), height: Int(height), bitsPerPixel: 8 * UInt32(MemoryLayout<Float>.size))
+            var destinationB = try vImage_Buffer(width: Int(width), height: Int(height), bitsPerPixel: 8 * UInt32(MemoryLayout<Float>.size))
+
+            var minFloat: [Float] = [-1.0, -1.0, -1.0, -1.0]
+            var maxFloat: [Float] = [1.0, 1.0, 1.0, 1.0]
+
+            // Split interleaved ARGB8888 into planar floats, mapping [0 255] -> [-1.0 1.0]
+            vImageConvert_ARGB8888toPlanarF(&mediumDestination, &destinationA, &destinationR, &destinationG, &destinationB, &maxFloat, &minFloat, .zero)
+
+            let redData = Data(bytes: destinationR.data, count: Int(width) * Int(height) * MemoryLayout<Float>.size)
+            let greenData = Data(bytes: destinationG.data, count: Int(width) * Int(height) * MemoryLayout<Float>.size)
+            let blueData = Data(bytes: destinationB.data, count: Int(width) * Int(height) * MemoryLayout<Float>.size)
+
+            let imageData = redData + greenData + blueData
+
+            let shapedArray = MLShapedArray<Float32>(data: imageData, shape: [1, 3, Int(height), Int(width)])
+
+            return shapedArray
+        }
+    }
+}
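
For orientation (reviewer sketch, not part of the patch): `plannerRGBShapedArray` is the inverse of `fromShapedArray` up to quantization — the former maps 8-bit RGB into the [-1.0, 1.0] range the VAE expects, the latter maps model output back to 8-bit RGB. A minimal round trip, where `inputImage` is an assumed RGB CGImage obtained elsewhere:

    // Sketch only: round-trips an image through the two conversions above.
    let pixels = try inputImage.plannerRGBShapedArray      // [0 255] -> [-1.0 1.0], shape [1, 3, H, W]
    let roundTrip = try CGImage.fromShapedArray(pixels)    // [-1.0 1.0] -> [0 255]
    // Up to UInt8 quantization, `roundTrip` should match `inputImage`.
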
diff --git a/swift/StableDiffusion/pipeline/Decoder.swift b/swift/StableDiffusion/pipeline/Decoder.swift
index 04f04ba..e9b2c70 100644
--- a/swift/StableDiffusion/pipeline/Decoder.swift
+++ b/swift/StableDiffusion/pipeline/Decoder.swift
@@ -3,7 +3,6 @@
 
 import Foundation
 import CoreML
-import Accelerate
 
 /// A decoder model which produces RGB images from latent samples
 @available(iOS 16.2, macOS 13.1, *)
@@ -57,12 +56,11 @@ public struct Decoder: ResourceManaging {
         }
 
         // Transform the outputs to CGImages
-        let images: [CGImage] = (0..<results.count).map { i in
+        let images: [CGImage] = try (0..<results.count).map { i in
             let result = results.features(at: i)
             let outputName = result.featureNames.first!
             let output = result.featureValue(for: outputName)!.multiArrayValue!
-            return toRGBCGImage(MLShapedArray<Float32>(output))
+            return try CGImage.fromShapedArray(MLShapedArray<Float32>(output))
         }
 
         return images
@@ -74,50 +72,4 @@ public struct Decoder: ResourceManaging {
         }
     }
 
-    typealias PixelBufferPFx1 = vImage.PixelBuffer<vImage.PlanarF>
-    typealias PixelBufferP8x3 = vImage.PixelBuffer<vImage.Planar8x3>
-    typealias PixelBufferIFx3 = vImage.PixelBuffer<vImage.InterleavedFx3>
-    typealias PixelBufferI8x3 = vImage.PixelBuffer<vImage.Interleaved8x3>
-
-    func toRGBCGImage(_ array: MLShapedArray<Float32>) -> CGImage {
-
-        // array is [N,C,H,W], where C==3
-        let channelCount = array.shape[1]
-        assert(channelCount == 3,
-               "Decoding model output has \(channelCount) channels, expected 3")
-        let height = array.shape[2]
-        let width = array.shape[3]
-
-        // Normalize each channel into a float between 0 and 1.0
-        let floatChannels = (0..<channelCount).map { i in
-
-            // Normalized channel output
-            let cOut = PixelBufferPFx1(width: width, height: height)
-
-            // Reference this channel in the array and normalize
-            array[0][i].withUnsafeShapedBufferPointer { ptr, _, strides in
-                let cIn = PixelBufferPFx1(data: .init(mutating: ptr.baseAddress!),
-                                          width: width, height: height,
-                                          byteCountPerRow: strides[0]*4)
-                // Map [-1.0 1.0] -> [0.0 1.0]
-                cIn.multiply(by: 0.5, preBias: 1.0, postBias: 0.0, destination: cOut)
-            }
-            return cOut
-        }
-
-        // Convert to interleaved and then to UInt8
-        let floatImage = PixelBufferIFx3(planarBuffers: floatChannels)
-        let uint8Image = PixelBufferI8x3(width: width, height: height)
-        floatImage.convert(to:uint8Image) // maps [0.0 1.0] -> [0 255] and clips
-
-        // Convert to uint8x3 to RGB CGImage (no alpha)
-        let bitmapInfo = CGBitmapInfo(rawValue: CGImageAlphaInfo.none.rawValue)
-        let cgImage = uint8Image.makeCGImage(cgImageFormat:
-            .init(bitsPerComponent: 8,
-                  bitsPerPixel: 3*8,
-                  colorSpace: CGColorSpaceCreateDeviceRGB(),
-                  bitmapInfo: bitmapInfo)!)!
-
-        return cgImage
-    }
 }
diff --git a/swift/StableDiffusion/pipeline/Encoder.swift b/swift/StableDiffusion/pipeline/Encoder.swift
new file mode 100644
index 0000000..043865c
--- /dev/null
+++ b/swift/StableDiffusion/pipeline/Encoder.swift
@@ -0,0 +1,80 @@
+// For licensing see accompanying LICENSE.md file.
+// Copyright (C) 2022 Apple Inc. All Rights Reserved.
+
+import Foundation
+import CoreML
+
+/// Encoder, currently supports image2image
+@available(iOS 16.0, macOS 13.0, *)
+public struct Encoder {
+
+    public enum Error: String, Swift.Error {
+        case latentOutputNotValid
+        case batchLatentOutputEmpty
+    }
+
+    /// VAE encoder model + post math and noise addition from the scheduler
+    var model: MLModel
+
+    /// Create encoder from Core ML model
+    ///
+    /// - Parameters:
+    ///     - model: Core ML model for VAE encoder
+    public init(model: MLModel) {
+        self.model = model
+    }
+
+    /// Prediction queue
+    let queue = DispatchQueue(label: "encoder.predict")
+
+    /// Encode an image into a latent sample for image2image
+    /// - Parameters:
+    ///   - image: source image for image2image
+    ///   - diagonalNoise: random noise for the `DiagonalGaussianDistribution` operation
+    ///   - noise: random noise for the initial latent space, scaled by the strength argument
+    ///   - alphasCumprodStep: scheduler calculations traditionally done in the PyTorch Diffusers pipeline
+    /// - Returns: The encoded latent space as MLShapedArray
+    public func encode(
+        image: CGImage,
+        diagonalNoise: MLShapedArray<Float32>,
+        noise: MLShapedArray<Float32>,
+        alphasCumprodStep: AlphasCumprodCalculation
+    ) throws -> MLShapedArray<Float32> {
+        let sample = try image.plannerRGBShapedArray
+        let sqrtAlphasCumprod = MLShapedArray(scalars: [alphasCumprodStep.sqrtAlphasCumprod], shape: [1, 1])
+        let sqrtOneMinusAlphasCumprod = MLShapedArray(scalars: [alphasCumprodStep.sqrtOneMinusAlphasCumprod], shape: [1, 1])
+
+        let dict: [String: Any] = [
+            "sample": MLMultiArray(sample),
+            "diagonalNoise": MLMultiArray(diagonalNoise),
+            "noise": MLMultiArray(noise),
+            "sqrtAlphasCumprod": MLMultiArray(sqrtAlphasCumprod),
+            "sqrtOneMinusAlphasCumprod": MLMultiArray(sqrtOneMinusAlphasCumprod),
+        ]
+        let featureProvider = try MLDictionaryFeatureProvider(dictionary: dict)
+
+        let batch = MLArrayBatchProvider(array: [featureProvider])
+
+        // Batch predict with model
+        let results = try queue.sync { try model.predictions(fromBatch: batch) }
+
+        let batchLatents: [MLShapedArray<Float32>] = try (0..<results.count).map { i in
+            let result = results.features(at: i)
+            guard
+                let outputName = result.featureNames.first,
+                let outputValue = result.featureValue(for: outputName),
+                let output = outputValue.multiArrayValue
+            else {
+                throw Error.latentOutputNotValid
+            }
+            return MLShapedArray<Float32>(output)
+        }
+
+        guard let latents = batchLatents.first else {
+            throw Error.batchLatentOutputEmpty
+        }
+
+        return latents
+    }
+}
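
Usage note (reviewer sketch, not part of the patch): a rough view of how the pieces above might combine for image2image. Every name here — `vaeEncoderModel`, `sourceImage`, `schedulerAlphasCumprod`, `diagonalNoise`, `initialNoise` — is an assumed input; the actual feature names and noise shapes depend on the converted Core ML model:

    // Sketch only, under the assumptions stated above.
    let encoder = Encoder(model: vaeEncoderModel)        // compiled VAE encoder .mlmodelc
    let alphas = AlphasCumprodCalculation(
        alphasCumprod: schedulerAlphasCumprod,           // from the scheduler's betas
        steps: 50,
        strength: 0.75)
    let latent = try encoder.encode(
        image: sourceImage,                              // RGB CGImage, sized for the model
        diagonalNoise: diagonalNoise,                    // shape is model-dependent
        noise: initialNoise,                             // shape is model-dependent
        alphasCumprodStep: alphas)
    // `latent` replaces the random starting latent; denoising then proceeds
    // from the timestep implied by `strength` rather than from pure noise.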