Image2Image Encoder

Encoder
3 years ago · b7280f4aa9
parent 6cd5c7a760
commit b7280f4aa9
4 changed files with 231 additions and 50 deletions
--- a/swift/StableDiffusion/pipeline/AlphasCumprodCalculation.swift
+++ b/swift/StableDiffusion/pipeline/AlphasCumprodCalculation.swift
@ -0,0 +1,29 @@
 // For licensing see accompanying LICENSE.md file.
 // Copyright (C) 2022 Apple Inc. All Rights Reserved.
 import Foundation
 /// Pair of scheduler coefficients, `sqrt(alphasCumprod[t])` and
 /// `sqrt(1 - alphasCumprod[t])`, evaluated at a single timestep `t`.
 public struct AlphasCumprodCalculation {
    /// Square root of the cumulative alpha product at the selected timestep.
    public var sqrtAlphasCumprod: Float
    /// Square root of one minus the cumulative alpha product at the selected timestep.
    public var sqrtOneMinusAlphasCumprod: Float

    /// Creates a value from coefficients that were computed elsewhere.
    ///
    /// - Parameters:
    ///   - sqrtAlphasCumprod: Stored as-is.
    ///   - sqrtOneMinusAlphasCumprod: Stored as-is.
    public init(
        sqrtAlphasCumprod: Float,
        sqrtOneMinusAlphasCumprod: Float
    ) {
        self.sqrtAlphasCumprod = sqrtAlphasCumprod
        self.sqrtOneMinusAlphasCumprod = sqrtOneMinusAlphasCumprod
    }

    /// Derives the coefficients from a cumulative alpha product table.
    ///
    /// The timestep is `timesteps - timesteps / steps * (steps - tEnc) + 1` with
    /// `tEnc = Int(strength * Float(steps))`, clamped so that it always addresses
    /// a valid element of `alphasCumprod`. Without the clamp a `strength` of 1.0
    /// yields `timesteps + 1`, which is past the end of the table.
    ///
    /// - Parameters:
    ///   - alphasCumprod: Cumulative alpha products indexed by timestep. Must not be empty.
    ///   - timesteps: Number of timesteps used for the index arithmetic. Defaults to 1000.
    ///   - steps: Number of inference steps. Must be greater than zero.
    ///   - strength: Multiplied by `steps` and truncated to obtain `tEnc`.
    public init(
        alphasCumprod: [Float],
        timesteps: Int = 1_000,
        steps: Int,
        strength: Float
    ) {
        // Both conditions would otherwise trap below with a less helpful message.
        precondition(steps > 0, "steps must be greater than zero")
        precondition(!alphasCumprod.isEmpty, "alphasCumprod must not be empty")

        let tEnc = Int(strength * Float(steps))
        let unclampedTimestep = timesteps - timesteps / steps * (steps - tEnc) + 1
        // Highest usable index: bounded by both the nominal timestep count and the table size.
        let lastIndex = max(0, min(timesteps, alphasCumprod.count) - 1)
        let initTimestep = min(max(0, unclampedTimestep), lastIndex)

        let alphaCumprod = alphasCumprod[initTimestep]
        self.sqrtAlphasCumprod = alphaCumprod.squareRoot()
        self.sqrtOneMinusAlphasCumprod = (1 - alphaCumprod).squareRoot()
    }
 }
--- a/swift/StableDiffusion/pipeline/CGImage+vImage.swift
+++ b/swift/StableDiffusion/pipeline/CGImage+vImage.swift
@ -0,0 +1,120 @@
 // For licensing see accompanying LICENSE.md file.
 // Copyright (C) 2022 Apple Inc. All Rights Reserved.
 import Foundation
 import Accelerate
 import CoreML
@available(iOS 16.0, macOS 13.0, *)
 extension CGImage {
    typealias PixelBufferPFx1 = vImage.PixelBuffer<vImage.PlanarF>
    typealias PixelBufferP8x3 = vImage.PixelBuffer<vImage.Planar8x3>
    typealias PixelBufferIFx3 = vImage.PixelBuffer<vImage.InterleavedFx3>
    typealias PixelBufferI8x3 = vImage.PixelBuffer<vImage.Interleaved8x3>

    /// Errors thrown while converting between `CGImage` and `MLShapedArray`.
    public enum ShapedArrayError: String, Swift.Error {
        case wrongNumberOfChannels
        case incorrectFormatsConvertingToShapedArray
        case vImageConverterNotInitialized
    }

    /// Builds an 8-bit RGB image (no alpha) from the first element of a
    /// `[N, C, H, W]` shaped array whose values lie in `[-1.0, 1.0]`.
    ///
    /// - Parameter array: Shaped array laid out as `[N, C, H, W]` with `C == 3`.
    /// - Returns: The converted image.
    /// - Throws: `ShapedArrayError.wrongNumberOfChannels` when the array is not
    ///   four-dimensional or its channel dimension is not 3.
    public static func fromShapedArray(_ array: MLShapedArray<Float32>) throws -> CGImage {
        // array is [N,C,H,W], where C==3. Check the rank first so the shape
        // subscripts below cannot run out of bounds.
        guard array.shape.count == 4, array.shape[1] == 3 else {
            throw ShapedArrayError.wrongNumberOfChannels
        }
        let channelCount = array.shape[1]
        let height = array.shape[2]
        let width = array.shape[3]
        // Normalize each channel into a float between 0 and 1.0
        let floatChannels = (0..<channelCount).map { i in
            // Normalized channel output
            let cOut = PixelBufferPFx1(width: width, height:height)
            // Reference this channel in the array and normalize
            array[0][i].withUnsafeShapedBufferPointer { ptr, _, strides in
                let cIn = PixelBufferPFx1(data: .init(mutating: ptr.baseAddress!),
                                          width: width, height: height,
                                          byteCountPerRow: strides[0]*4)
                // Map [-1.0 1.0] -> [0.0 1.0]
                cIn.multiply(by: 0.5, preBias: 1.0, postBias: 0.0, destination: cOut)
            }
            return cOut
        }
        // Convert to interleaved and then to UInt8
        let floatImage = PixelBufferIFx3(planarBuffers: floatChannels)
        let uint8Image = PixelBufferI8x3(width: width, height: height)
        floatImage.convert(to:uint8Image) // maps [0.0 1.0] -> [0 255] and clips
        // Convert to uint8x3 to RGB CGImage (no alpha)
        let bitmapInfo = CGBitmapInfo(rawValue: CGImageAlphaInfo.none.rawValue)
        let cgImage = uint8Image.makeCGImage(cgImageFormat:
                .init(bitsPerComponent: 8,
                      bitsPerPixel: 3*8,
                      colorSpace: CGColorSpaceCreateDeviceRGB(),
                      bitmapInfo: bitmapInfo)!)!
        return cgImage
    }

    /// The image as a planar RGB shaped array of shape `[1, 3, height, width]`
    /// with channel values mapped from `[0, 255]` to `[-1.0, 1.0]`.
    ///
    /// The image is first converted to 8-bit ARGB in the device RGB color space,
    /// then split into float planes; the alpha plane is discarded.
    ///
    /// - Throws: `ShapedArrayError.incorrectFormatsConvertingToShapedArray` when the
    ///   pixel formats cannot be described or a vImage conversion reports an error,
    ///   `ShapedArrayError.vImageConverterNotInitialized` when no converter can be
    ///   created, and any error thrown while allocating the vImage buffers.
    public var plannerRGBShapedArray: MLShapedArray<Float32> {
        get throws {
            guard
                var sourceFormat = vImage_CGImageFormat(cgImage: self),
                var mediumFormat = vImage_CGImageFormat(
                    bitsPerComponent: 8 * MemoryLayout<UInt8>.size,
                    bitsPerPixel: 8 * MemoryLayout<UInt8>.size * 4,
                    colorSpace: CGColorSpaceCreateDeviceRGB(),
                    bitmapInfo: CGBitmapInfo(rawValue: CGImageAlphaInfo.first.rawValue)),
                let width = vImagePixelCount(exactly: self.width),
                let height = vImagePixelCount(exactly: self.height)
            else {
                throw ShapedArrayError.incorrectFormatsConvertingToShapedArray
            }
            let pixelWidth = Int(width)
            let pixelHeight = Int(height)
            let floatBitsPerPixel = 8 * UInt32(MemoryLayout<Float>.size)

            // Every vImage_Buffer below owns heap memory; release it on all exit paths.
            var sourceImageBuffer = try vImage_Buffer(cgImage: self)
            defer { sourceImageBuffer.free() }
            var mediumDestination = try vImage_Buffer(width: pixelWidth, height: pixelHeight, bitsPerPixel: mediumFormat.bitsPerPixel)
            defer { mediumDestination.free() }

            let converter = vImageConverter_CreateWithCGImageFormat(
                &sourceFormat,
                &mediumFormat,
                nil,
                vImage_Flags(kvImagePrintDiagnosticsToConsole),
                nil)
            guard let converter = converter?.takeRetainedValue() else {
                throw ShapedArrayError.vImageConverterNotInitialized
            }
            let anyToAnyError = vImageConvert_AnyToAny(converter, &sourceImageBuffer, &mediumDestination, nil, vImage_Flags(kvImagePrintDiagnosticsToConsole))
            guard anyToAnyError == kvImageNoError else {
                throw ShapedArrayError.incorrectFormatsConvertingToShapedArray
            }

            var destinationA = try vImage_Buffer(width: pixelWidth, height: pixelHeight, bitsPerPixel: floatBitsPerPixel)
            defer { destinationA.free() }
            var destinationR = try vImage_Buffer(width: pixelWidth, height: pixelHeight, bitsPerPixel: floatBitsPerPixel)
            defer { destinationR.free() }
            var destinationG = try vImage_Buffer(width: pixelWidth, height: pixelHeight, bitsPerPixel: floatBitsPerPixel)
            defer { destinationG.free() }
            var destinationB = try vImage_Buffer(width: pixelWidth, height: pixelHeight, bitsPerPixel: floatBitsPerPixel)
            defer { destinationB.free() }

            var minFloat: [Float] = [-1.0, -1.0, -1.0, -1.0]
            var maxFloat: [Float] = [1.0, 1.0, 1.0, 1.0]
            let planarError = vImageConvert_ARGB8888toPlanarF(&mediumDestination, &destinationA, &destinationR, &destinationG, &destinationB, &maxFloat, &minFloat, .zero)
            guard planarError == kvImageNoError else {
                throw ShapedArrayError.incorrectFormatsConvertingToShapedArray
            }

            // Copy row by row: a buffer's rowBytes may be larger than
            // width * sizeof(Float), so the planes are not guaranteed to be
            // tightly packed in memory.
            let packedRowByteCount = pixelWidth * MemoryLayout<Float>.size
            var imageData = Data(capacity: 3 * pixelHeight * packedRowByteCount)
            for plane in [destinationR, destinationG, destinationB] {
                for row in 0..<pixelHeight {
                    let rowStart = plane.data.advanced(by: row * plane.rowBytes)
                    imageData.append(Data(bytes: rowStart, count: packedRowByteCount))
                }
            }

            // Shape follows the actual image size rather than a fixed 512x512.
            return MLShapedArray<Float32>(data: imageData, shape: [1, 3, pixelHeight, pixelWidth])
        }
    }
 }
--- a/swift/StableDiffusion/pipeline/Decoder.swift
+++ b/swift/StableDiffusion/pipeline/Decoder.swift
@ -3,7 +3,6 @@
 import Foundation
 import CoreML
 import Accelerate
 /// A decoder model which produces RGB images from latent samples
@available(iOS 16.2, macOS 13.1, *)
@ -57,12 +56,11 @@ public struct Decoder: ResourceManaging {
        }
        // Transform the outputs to CGImages
-        let images: [CGImage] = (0..<results.count).map { i in
+        let images: [CGImage] = try (0..<results.count).map { i in
            let result = results.features(at: i)
            let outputName = result.featureNames.first!
            let output = result.featureValue(for: outputName)!.multiArrayValue!
-
+            return try CGImage.fromShapedArray(MLShapedArray<Float32>(output))
            return toRGBCGImage(MLShapedArray<Float32>(output))
        }
        return images
@ -74,50 +72,4 @@ public struct Decoder: ResourceManaging {
        }
    }
    typealias PixelBufferPFx1 = vImage.PixelBuffer<vImage.PlanarF>
    typealias PixelBufferP8x3 = vImage.PixelBuffer<vImage.Planar8x3>
    typealias PixelBufferIFx3 = vImage.PixelBuffer<vImage.InterleavedFx3>
    typealias PixelBufferI8x3 = vImage.PixelBuffer<vImage.Interleaved8x3>
    /// Converts the first element of a `[N, C, H, W]` shaped array with values in
    /// `[-1.0, 1.0]` into an 8-bit RGB `CGImage` without an alpha channel.
    ///
    /// - Parameter array: Shaped array laid out as `[N, C, H, W]`; `C` is asserted to be 3.
    /// - Returns: The converted image.
    func toRGBCGImage(_ array: MLShapedArray<Float32>) -> CGImage {
        // Layout is [N,C,H,W]; only three-channel input is supported.
        let channelCount = array.shape[1]
        assert(channelCount == 3,
               "Decoding model output has \(channelCount) channels, expected 3")
        let rows = array.shape[2]
        let columns = array.shape[3]

        // Rescale every channel plane from [-1.0 1.0] to [0.0 1.0].
        var normalizedPlanes: [PixelBufferPFx1] = []
        normalizedPlanes.reserveCapacity(channelCount)
        for channel in 0..<channelCount {
            let scaledPlane = PixelBufferPFx1(width: columns, height: rows)
            array[0][channel].withUnsafeShapedBufferPointer { pointer, _, strides in
                // Wrap the channel's storage in place instead of copying it.
                let sourcePlane = PixelBufferPFx1(data: .init(mutating: pointer.baseAddress!),
                                                  width: columns, height: rows,
                                                  byteCountPerRow: strides[0]*4)
                sourcePlane.multiply(by: 0.5, preBias: 1.0, postBias: 0.0, destination: scaledPlane)
            }
            normalizedPlanes.append(scaledPlane)
        }

        // Interleave the planes, then quantize: [0.0 1.0] -> [0 255] with clipping.
        let interleavedFloats = PixelBufferIFx3(planarBuffers: normalizedPlanes)
        let interleavedBytes = PixelBufferI8x3(width: columns, height: rows)
        interleavedFloats.convert(to: interleavedBytes)

        // Describe the pixels as 24-bit device RGB with no alpha and wrap them in a CGImage.
        let rgbFormat = vImage_CGImageFormat(
            bitsPerComponent: 8,
            bitsPerPixel: 3*8,
            colorSpace: CGColorSpaceCreateDeviceRGB(),
            bitmapInfo: CGBitmapInfo(rawValue: CGImageAlphaInfo.none.rawValue))!
        return interleavedBytes.makeCGImage(cgImageFormat: rgbFormat)!
    }
 }
--- a/swift/StableDiffusion/pipeline/Encoder.swift
+++ b/swift/StableDiffusion/pipeline/Encoder.swift
@ -0,0 +1,80 @@
 // For licensing see accompanying LICENSE.md file.
 // Copyright (C) 2022 Apple Inc. All Rights Reserved.
 import Foundation
 import CoreML
@available(iOS 16.0, macOS 13.0, *)
 /// Encoder, currently supports image2image
 public struct Encoder {
    /// Errors thrown when the model's prediction output cannot be used.
    public enum Error: String, Swift.Error {
        case latentOutputNotValid
        case batchLatentOutputEmpty
    }

    /// VAE encoder model + post math and adding noise from scheduler
    var model: MLModel

    /// Create encoder from Core ML model
    ///
    /// - Parameters
    ///     - model: Core ML model for VAE encoder
    public init(model: MLModel) {
        self.model = model
    }

    /// Prediction queue
    let queue = DispatchQueue(label: "encoder.predict")

    /// Encode an image into a latent sample
    /// - Parameters:
    ///   - image: image used for image2image
    ///   - diagonalNoise: random noise for `DiagonalGaussianDistribution` operation
    ///   - noise: random noise for initial latent space based on strength argument
    ///   - alphasCumprodStep: calculations using the scheduler traditionally calculated in the pipeline in pyTorch Diffusers library.
    /// - Returns: The encoded latent space as MLShapedArray
    /// - Throws: `Error.latentOutputNotValid` when a prediction result has no
    ///   multi-array output, `Error.batchLatentOutputEmpty` when the model returns
    ///   no results, and any error from image conversion or Core ML prediction.
    public func encode(
        image:  CGImage,
        diagonalNoise: MLShapedArray<Float32>,
        noise: MLShapedArray<Float32>,
        alphasCumprodStep: AlphasCumprodCalculation
    ) throws -> MLShapedArray<Float32> {
        let sample = try image.plannerRGBShapedArray
        // The scalar coefficients are passed to the model as 1x1 arrays.
        let sqrtAlphasCumprod = MLShapedArray(scalars: [alphasCumprodStep.sqrtAlphasCumprod], shape: [1, 1])
        let sqrtOneMinusAlphasCumprod = MLShapedArray(scalars: [alphasCumprodStep.sqrtOneMinusAlphasCumprod], shape: [1, 1])

        let dict: [String: Any] = [
            "sample": MLMultiArray(sample),
            "diagonalNoise": MLMultiArray(diagonalNoise),
            "noise": MLMultiArray(noise),
            "sqrtAlphasCumprod": MLMultiArray(sqrtAlphasCumprod),
            "sqrtOneMinusAlphasCumprod": MLMultiArray(sqrtOneMinusAlphasCumprod),
        ]
        let featureProvider = try MLDictionaryFeatureProvider(dictionary: dict)
        let batch = MLArrayBatchProvider(array: [featureProvider])

        // Batch predict with model
        let results = try queue.sync { try model.predictions(fromBatch: batch) }

        // Every result must carry a multi-array output; the first feature name is used.
        let batchLatents: [MLShapedArray<Float32>] = try (0..<results.count).map { i -> MLShapedArray<Float32> in
            let result = results.features(at: i)
            guard
                let outputName = result.featureNames.first,
                let output = result.featureValue(for: outputName)?.multiArrayValue
            else {
                throw Error.latentOutputNotValid
            }
            return MLShapedArray<Float32>(output)
        }

        guard let latents = batchLatents.first else {
            throw Error.batchLatentOutputEmpty
        }
        return latents
    }
 }