Image2Image Encoder

Encoder
pull/116/head
Timothy Kautz 1 year ago
parent 6cd5c7a760
commit b7280f4aa9

@ -0,0 +1,29 @@
// For licensing see accompanying LICENSE.md file.
// Copyright (C) 2022 Apple Inc. All Rights Reserved.
import Foundation
public struct AlphasCumprodCalculation {
public var sqrtAlphasCumprod: Float
public var sqrtOneMinusAlphasCumprod: Float
public init(
sqrtAlphasCumprod: Float,
sqrtOneMinusAlphasCumprod: Float
) {
self.sqrtAlphasCumprod = sqrtAlphasCumprod
self.sqrtOneMinusAlphasCumprod = sqrtOneMinusAlphasCumprod
}
public init(
alphasCumprod: [Float],
timesteps: Int = 1_000,
steps: Int,
strength: Float
) {
let tEnc = Int(strength * Float(steps))
let initTimestep = timesteps - timesteps / steps * (steps - tEnc) + 1
self.sqrtAlphasCumprod = alphasCumprod[initTimestep].squareRoot()
self.sqrtOneMinusAlphasCumprod = (1 - alphasCumprod[initTimestep]).squareRoot()
}
}

@ -0,0 +1,120 @@
// For licensing see accompanying LICENSE.md file.
// Copyright (C) 2022 Apple Inc. All Rights Reserved.
import Foundation
import Accelerate
import CoreML
@available(iOS 16.0, macOS 13.0, *)
extension CGImage {
typealias PixelBufferPFx1 = vImage.PixelBuffer<vImage.PlanarF>
typealias PixelBufferP8x3 = vImage.PixelBuffer<vImage.Planar8x3>
typealias PixelBufferIFx3 = vImage.PixelBuffer<vImage.InterleavedFx3>
typealias PixelBufferI8x3 = vImage.PixelBuffer<vImage.Interleaved8x3>
public enum ShapedArrayError: String, Swift.Error {
case wrongNumberOfChannels
case incorrectFormatsConvertingToShapedArray
case vImageConverterNotInitialized
}
public static func fromShapedArray(_ array: MLShapedArray<Float32>) throws -> CGImage {
// array is [N,C,H,W], where C==3
let channelCount = array.shape[1]
guard channelCount == 3 else {
throw ShapedArrayError.wrongNumberOfChannels
}
let height = array.shape[2]
let width = array.shape[3]
// Normalize each channel into a float between 0 and 1.0
let floatChannels = (0..<channelCount).map { i in
// Normalized channel output
let cOut = PixelBufferPFx1(width: width, height:height)
// Reference this channel in the array and normalize
array[0][i].withUnsafeShapedBufferPointer { ptr, _, strides in
let cIn = PixelBufferPFx1(data: .init(mutating: ptr.baseAddress!),
width: width, height: height,
byteCountPerRow: strides[0]*4)
// Map [-1.0 1.0] -> [0.0 1.0]
cIn.multiply(by: 0.5, preBias: 1.0, postBias: 0.0, destination: cOut)
}
return cOut
}
// Convert to interleaved and then to UInt8
let floatImage = PixelBufferIFx3(planarBuffers: floatChannels)
let uint8Image = PixelBufferI8x3(width: width, height: height)
floatImage.convert(to:uint8Image) // maps [0.0 1.0] -> [0 255] and clips
// Convert to uint8x3 to RGB CGImage (no alpha)
let bitmapInfo = CGBitmapInfo(rawValue: CGImageAlphaInfo.none.rawValue)
let cgImage = uint8Image.makeCGImage(cgImageFormat:
.init(bitsPerComponent: 8,
bitsPerPixel: 3*8,
colorSpace: CGColorSpaceCreateDeviceRGB(),
bitmapInfo: bitmapInfo)!)!
return cgImage
}
public var plannerRGBShapedArray: MLShapedArray<Float32> {
get throws {
guard
var sourceFormat = vImage_CGImageFormat(cgImage: self),
var mediumFormat = vImage_CGImageFormat(
bitsPerComponent: 8 * MemoryLayout<UInt8>.size,
bitsPerPixel: 8 * MemoryLayout<UInt8>.size * 4,
colorSpace: CGColorSpaceCreateDeviceRGB(),
bitmapInfo: CGBitmapInfo(rawValue: CGImageAlphaInfo.first.rawValue)),
let width = vImagePixelCount(exactly: self.width),
let height = vImagePixelCount(exactly: self.height)
else {
throw ShapedArrayError.incorrectFormatsConvertingToShapedArray
}
var sourceImageBuffer = try vImage_Buffer(cgImage: self)
var mediumDesination = try vImage_Buffer(width: Int(width), height: Int(height), bitsPerPixel: mediumFormat.bitsPerPixel)
let converter = vImageConverter_CreateWithCGImageFormat(
&sourceFormat,
&mediumFormat,
nil,
vImage_Flags(kvImagePrintDiagnosticsToConsole),
nil)
guard let converter = converter?.takeRetainedValue() else {
throw ShapedArrayError.vImageConverterNotInitialized
}
vImageConvert_AnyToAny(converter, &sourceImageBuffer, &mediumDesination, nil, vImage_Flags(kvImagePrintDiagnosticsToConsole))
var destinationA = try vImage_Buffer(width: Int(width), height: Int(height), bitsPerPixel: 8 * UInt32(MemoryLayout<Float>.size))
var destinationR = try vImage_Buffer(width: Int(width), height: Int(height), bitsPerPixel: 8 * UInt32(MemoryLayout<Float>.size))
var destinationG = try vImage_Buffer(width: Int(width), height: Int(height), bitsPerPixel: 8 * UInt32(MemoryLayout<Float>.size))
var destinationB = try vImage_Buffer(width: Int(width), height: Int(height), bitsPerPixel: 8 * UInt32(MemoryLayout<Float>.size))
var minFloat: [Float] = [-1.0, -1.0, -1.0, -1.0]
var maxFloat: [Float] = [1.0, 1.0, 1.0, 1.0]
vImageConvert_ARGB8888toPlanarF(&mediumDesination, &destinationA, &destinationR, &destinationG, &destinationB, &maxFloat, &minFloat, .zero)
let redData = Data(bytes: destinationR.data, count: Int(width) * Int(height) * MemoryLayout<Float>.size)
let greenData = Data(bytes: destinationG.data, count: Int(width) * Int(height) * MemoryLayout<Float>.size)
let blueData = Data(bytes: destinationB.data, count: Int(width) * Int(height) * MemoryLayout<Float>.size)
let imageData = redData + greenData + blueData
let shapedArray = MLShapedArray<Float32>(data: imageData, shape: [1, 3, 512, 512])
return shapedArray
}
}
}

@ -3,7 +3,6 @@
import Foundation
import CoreML
import Accelerate
/// A decoder model which produces RGB images from latent samples
@available(iOS 16.2, macOS 13.1, *)
@ -57,12 +56,11 @@ public struct Decoder: ResourceManaging {
}
// Transform the outputs to CGImages
let images: [CGImage] = (0..<results.count).map { i in
let images: [CGImage] = try (0..<results.count).map { i in
let result = results.features(at: i)
let outputName = result.featureNames.first!
let output = result.featureValue(for: outputName)!.multiArrayValue!
return toRGBCGImage(MLShapedArray<Float32>(output))
return try CGImage.fromShapedArray(MLShapedArray<Float32>(output))
}
return images
@ -74,50 +72,4 @@ public struct Decoder: ResourceManaging {
}
}
typealias PixelBufferPFx1 = vImage.PixelBuffer<vImage.PlanarF>
typealias PixelBufferP8x3 = vImage.PixelBuffer<vImage.Planar8x3>
typealias PixelBufferIFx3 = vImage.PixelBuffer<vImage.InterleavedFx3>
typealias PixelBufferI8x3 = vImage.PixelBuffer<vImage.Interleaved8x3>
func toRGBCGImage(_ array: MLShapedArray<Float32>) -> CGImage {
// array is [N,C,H,W], where C==3
let channelCount = array.shape[1]
assert(channelCount == 3,
"Decoding model output has \(channelCount) channels, expected 3")
let height = array.shape[2]
let width = array.shape[3]
// Normalize each channel into a float between 0 and 1.0
let floatChannels = (0..<channelCount).map { i in
// Normalized channel output
let cOut = PixelBufferPFx1(width: width, height:height)
// Reference this channel in the array and normalize
array[0][i].withUnsafeShapedBufferPointer { ptr, _, strides in
let cIn = PixelBufferPFx1(data: .init(mutating: ptr.baseAddress!),
width: width, height: height,
byteCountPerRow: strides[0]*4)
// Map [-1.0 1.0] -> [0.0 1.0]
cIn.multiply(by: 0.5, preBias: 1.0, postBias: 0.0, destination: cOut)
}
return cOut
}
// Convert to interleaved and then to UInt8
let floatImage = PixelBufferIFx3(planarBuffers: floatChannels)
let uint8Image = PixelBufferI8x3(width: width, height: height)
floatImage.convert(to:uint8Image) // maps [0.0 1.0] -> [0 255] and clips
// Convert to uint8x3 to RGB CGImage (no alpha)
let bitmapInfo = CGBitmapInfo(rawValue: CGImageAlphaInfo.none.rawValue)
let cgImage = uint8Image.makeCGImage(cgImageFormat:
.init(bitsPerComponent: 8,
bitsPerPixel: 3*8,
colorSpace: CGColorSpaceCreateDeviceRGB(),
bitmapInfo: bitmapInfo)!)!
return cgImage
}
}

@ -0,0 +1,80 @@
// For licensing see accompanying LICENSE.md file.
// Copyright (C) 2022 Apple Inc. All Rights Reserved.
import Foundation
import CoreML
@available(iOS 16.0, macOS 13.0, *)
/// Encoder, currently supports image2image
public struct Encoder {
public enum Error: String, Swift.Error {
case latentOutputNotValid
case batchLatentOutputEmpty
}
/// VAE encoder model + post math and adding noise from schedular
var model: MLModel
/// Create decoder from Core ML model
///
/// - Parameters
/// - model: Core ML model for VAE decoder
public init(model: MLModel) {
self.model = model
}
/// Prediction queue
let queue = DispatchQueue(label: "encoder.predict")
/// Batch encode latent samples into images
/// - Parameters:
/// - image: image used for image2image
/// - diagonalNoise: random noise for `DiagonalGaussianDistribution` operation
/// - noise: random noise for initial latent space based on strength argument
/// - alphasCumprodStep: calculations using the scheduler traditionally calculated in the pipeline in pyTorch Diffusers library.
/// - Returns: The encoded latent space as MLShapedArray
public func encode(
image: CGImage,
diagonalNoise: MLShapedArray<Float32>,
noise: MLShapedArray<Float32>,
alphasCumprodStep: AlphasCumprodCalculation
) throws -> MLShapedArray<Float32> {
let sample = try image.plannerRGBShapedArray
let sqrtAlphasCumprod = MLShapedArray(scalars: [alphasCumprodStep.sqrtAlphasCumprod], shape: [1, 1])
let sqrtOneMinusAlphasCumprod = MLShapedArray(scalars: [alphasCumprodStep.sqrtOneMinusAlphasCumprod], shape: [1, 1])
let dict: [String: Any] = [
"sample": MLMultiArray(sample),
"diagonalNoise": MLMultiArray(diagonalNoise),
"noise": MLMultiArray(noise),
"sqrtAlphasCumprod": MLMultiArray(sqrtAlphasCumprod),
"sqrtOneMinusAlphasCumprod": MLMultiArray(sqrtOneMinusAlphasCumprod),
]
let featureProvider = try MLDictionaryFeatureProvider(dictionary: dict)
let batch = MLArrayBatchProvider(array: [featureProvider])
// Batch predict with model
let results = try queue.sync { try model.predictions(fromBatch: batch) }
let batchLatents: [MLShapedArray<Float32>] = try (0..<results.count).compactMap { i in
let result = results.features(at: i)
guard
let outputName = result.featureNames.first,
let output = result.featureValue(for: outputName)?.multiArrayValue
else {
throw Error.latentOutputNotValid
}
print("output.shape: \(output.shape)")
return MLShapedArray(output)
}
guard let latents = batchLatents.first else {
throw Error.batchLatentOutputEmpty
}
return latents
}
}
Loading…
Cancel
Save