Scheduler and pipeline

3 years ago · e3a85872d0
parent b7280f4aa9
commit e3a85872d0
4 changed files with 120 additions and 5 deletions
--- a/swift/StableDiffusion/pipeline/DPMSolverMultistepScheduler.swift
+++ b/swift/StableDiffusion/pipeline/DPMSolverMultistepScheduler.swift
@ -23,12 +23,16 @@ public final class DPMSolverMultistepScheduler: Scheduler {
    public let betas: [Float]
    public let alphas: [Float]
    public let alphasCumProd: [Float]
-    public let timeSteps: [Int]
+    private let timeSteps: [Int]

    public let alpha_t: [Float]
    public let sigma_t: [Float]
    public let lambda_t: [Float]
    
+    /// Read-only view of the privately stored `timeSteps`, exposed to satisfy `Scheduler`.
+    public var allTimeSteps: [Int] {
+        get { timeSteps }
+    }
+    
    public let solverOrder = 2
    private(set) var lowerOrderStepped = 0
    
--- a/swift/StableDiffusion/pipeline/Scheduler.swift
+++ b/swift/StableDiffusion/pipeline/Scheduler.swift
@ -10,9 +10,12 @@ public protocol Scheduler {

    /// Number of inference steps to be performed
    var inferenceStepCount: Int { get }
+    
+    /// Full list of training diffusion time steps, indexed by inference time step
+    var allTimeSteps: [Int] { get }

    /// Time steps the de-noising loop iterates over; a non-nil `strength` (image2image) selects only part of the schedule
-    var timeSteps: [Int] { get }
+    func calculateTimesteps(strength: Float?) -> [Int]

    /// Schedule of betas which controls the amount of noise added at each timestep
    var betas: [Float] { get }
@ -71,6 +74,35 @@ public extension Scheduler {
    }
 }

+// MARK: - Image2Image
+
+@available(iOS 16.2, macOS 13.1, *)
+public extension Scheduler {
+
+    /// Builds the `AlphasCumprodCalculation` used when encoding an image2image starting image.
+    ///
+    /// - Parameter strength: Forwarded unchanged to `AlphasCumprodCalculation`.
+    /// - Returns: A calculation built from this scheduler's `alphasCumProd`,
+    ///   `trainStepCount` and `inferenceStepCount`.
+    func calculateAlphasCumprod(strength: Float) -> AlphasCumprodCalculation {
+        let calculation = AlphasCumprodCalculation(
+            alphasCumprod: alphasCumProd,
+            timesteps: trainStepCount,
+            steps: inferenceStepCount,
+            strength: strength
+        )
+        return calculation
+    }
+}
+
+// MARK: - Timesteps
+
+@available(iOS 16.2, macOS 13.1, *)
+public extension Scheduler {
+
+    /// Returns the time steps the de-noising loop should iterate over.
+    ///
+    /// - Parameter strength: `nil` selects every entry of `allTimeSteps`. Otherwise the
+    ///   first `inferenceStepCount * strength` entries (rounded toward zero) are selected.
+    ///   Requests outside `0...allTimeSteps.count` are clamped rather than trapping on an
+    ///   out-of-bounds or inverted range.
+    /// - Returns: The selected entries of `allTimeSteps`, in reversed order.
+    func calculateTimesteps(strength: Float?) -> [Int] {
+        guard let strength else { return allTimeSteps.reversed() }
+        // Clamp in floating point first so an oversized or infinite product never reaches
+        // the trapping `Int(_:)` conversion, and a negative one never forms an inverted range.
+        let requestedSteps = Float(inferenceStepCount) * strength
+        let upperBound = Float(allTimeSteps.count)
+        let startStep = Int(min(max(requestedSteps, 0), upperBound))
+        // `prefix` additionally tolerates a count larger than the array.
+        return Array(allTimeSteps.prefix(startStep).reversed())
+    }
+}
+
+// MARK: - BetaSchedule
+
 /// How to map a beta range to a sequence of betas to step over
@available(iOS 16.2, macOS 13.1, *)
 public enum BetaSchedule {
@ -80,6 +112,7 @@ public enum BetaSchedule {
    case scaledLinear
 }

+// MARK: - PNDMScheduler

 /// A scheduler used to compute a de-noised image
 ///
@ -94,7 +127,11 @@ public final class PNDMScheduler: Scheduler {
    public let betas: [Float]
    public let alphas: [Float]
    public let alphasCumProd: [Float]
-    public let timeSteps: [Int]
+    private let timeSteps: [Int]
+    
+    /// Read-only view of the privately stored `timeSteps`, exposed to satisfy `Scheduler`.
+    public var allTimeSteps: [Int] {
+        get { timeSteps }
+    }

    // Internal state
    var counter: Int
--- a/swift/StableDiffusion/pipeline/StableDiffusionPipeline+Resources.swift
+++ b/swift/StableDiffusion/pipeline/StableDiffusionPipeline+Resources.swift
@ -14,6 +14,7 @@ public extension StableDiffusionPipeline {
        public let unetChunk1URL: URL
        public let unetChunk2URL: URL
        public let decoderURL: URL
+        public let encoderURL: URL
        public let safetyCheckerURL: URL
        public let vocabURL: URL
        public let mergesURL: URL
@ -24,6 +25,7 @@ public extension StableDiffusionPipeline {
            unetChunk1URL = baseURL.appending(path: "UnetChunk1.mlmodelc")
            unetChunk2URL = baseURL.appending(path: "UnetChunk2.mlmodelc")
            decoderURL = baseURL.appending(path: "VAEDecoder.mlmodelc")
+            encoderURL = baseURL.appending(path: "VAEEncoder.mlmodelc")
            safetyCheckerURL = baseURL.appending(path: "SafetyChecker.mlmodelc")
            vocabURL = baseURL.appending(path: "vocab.json")
            mergesURL = baseURL.appending(path: "merges.txt")
@ -74,11 +76,22 @@ public extension StableDiffusionPipeline {
            FileManager.default.fileExists(atPath: urls.safetyCheckerURL.path) {
            safetyChecker = SafetyChecker(modelAt: urls.safetyCheckerURL, configuration: config)
        }
+        
+        // Optional Image Encoder
+        let encoder: Encoder?
+        if
+            let encoderModel = try? MLModel(contentsOf: urls.encoderURL, configuration: config)
+        {
+            encoder = Encoder(model: encoderModel)
+        } else {
+            encoder = nil
+        }

        // Construct pipeline
        self.init(textEncoder: textEncoder,
                  unet: unet,
                  decoder: decoder,
+                  encoder: encoder,
                  safetyChecker: safetyChecker,
                  reduceMemory: reduceMemory)
    }
--- a/swift/StableDiffusion/pipeline/StableDiffusionPipeline.swift
+++ b/swift/StableDiffusion/pipeline/StableDiffusionPipeline.swift
@ -20,6 +20,10 @@ public enum StableDiffusionScheduler {
 /// [Hugging Face Diffusers Pipeline](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py)
@available(iOS 16.2, macOS 13.1, *)
 public struct StableDiffusionPipeline: ResourceManaging {
+    
+    /// Errors thrown by the pipeline itself.
+    public enum Error: String, Swift.Error {
+        /// `generateImages` was given a `startingImage` but the pipeline has no `encoder`.
+        case startingImageProvidedWithoutEncoder
+    }

    /// Model to generate embeddings for tokenized input text
    var textEncoder: TextEncoder
@ -29,6 +33,9 @@ public struct StableDiffusionPipeline: ResourceManaging {

    /// Model used to generate final image from latent diffusion process
    var decoder: Decoder
+    
+    /// Model used to encode a starting image into latent space for image2image and, soon, in-painting
+    var encoder: Encoder?

    /// Optional model for checking safety of generated image
    var safetyChecker: SafetyChecker? = nil
@ -58,11 +65,13 @@ public struct StableDiffusionPipeline: ResourceManaging {
    public init(textEncoder: TextEncoder,
                unet: Unet,
                decoder: Decoder,
+                // Defaulted, like `safetyChecker`, so call sites written before
+                // image2image support keep compiling; `nil` means text-to-image only.
+                encoder: Encoder? = nil,
                safetyChecker: SafetyChecker? = nil,
                reduceMemory: Bool = false) {
        self.textEncoder = textEncoder
        self.unet = unet
        self.decoder = decoder
+        self.encoder = encoder
        self.safetyChecker = safetyChecker
        self.reduceMemory = reduceMemory
    }
@ -114,6 +123,8 @@ public struct StableDiffusionPipeline: ResourceManaging {
    public func generateImages(
        prompt: String,
        negativePrompt: String = "",
+        startingImage: CGImage? = nil,
+        strength: Float = 1.0,
        imageCount: Int = 1,
        stepCount: Int = 50,
        seed: UInt32 = 0,
@ -150,10 +161,31 @@ public struct StableDiffusionPipeline: ResourceManaging {
        let stdev = scheduler[0].initNoiseSigma

        // Generate random latent samples from specified seed
-        var latents = generateLatentSamples(imageCount, stdev: stdev, seed: seed)
+        var latents: [MLShapedArray<Float32>]
+        let timestepStrength: Float?
+        
+        if let startingImage {
+            timestepStrength = strength
+            guard let encoder else {
+                throw Error.startingImageProvidedWithoutEncoder
+            }
+            let noiseTuples = generateImage2ImageLatentSamples(imageCount, stdev: 1, seed: seed)
+            latents = try noiseTuples.map({
+                try encoder.encode(
+                    image: startingImage,
+                    diagonalNoise: $0.diagonal,
+                    noise: $0.latentNoise,
+                    alphasCumprodStep: scheduler[0].calculateAlphasCumprod(strength: strength))
+            })
+        } else {
+            timestepStrength = nil
+            // Generate random latent samples from specified seed
+            latents = generateLatentSamples(imageCount, stdev: stdev, seed: seed)
+        }

        // De-noising loop
-        for (step,t) in scheduler[0].timeSteps.enumerated() {
+        let timeSteps = scheduler[0].calculateTimesteps(strength: timestepStrength)
+        for (step,t) in timeSteps.enumerated() {

            // Expand the latents for classifier-free guidance
            // and input to the Unet noise prediction model
@ -215,6 +247,35 @@ public struct StableDiffusionPipeline: ResourceManaging {
        }
        return samples
    }
+    
+    
+    /// Generates the noise needed to encode a starting image for image2image.
+    ///
+    /// Each batch entry gets a pair of Gaussian noise arrays: one for the encoder's
+    /// `diagonalNoise` input and one for its `noise` input.
+    ///
+    /// - Parameters:
+    ///   - count: Batch size; one tuple is produced per image.
+    ///   - stdev: Standard deviation of the sampled noise (the image2image path passes 1).
+    ///   - seed: Seed for the random source. Any integer type is accepted and truncated
+    ///     to `UInt32`, so both `Int` and `UInt32` seeds can be passed without conversion.
+    ///   - diagonalAndLatentNoiseIsSame: When `true`, both tuple members share one draw.
+    ///     The Diffusers library does not seem to use the same noise for the
+    ///     `DiagonalGaussianDistribution` operation, but some pipeline implementations do.
+    /// - Returns: An array of noise tuples with length equal to the batch size.
+    func generateImage2ImageLatentSamples(_ count: Int, stdev: Float, seed: some BinaryInteger, diagonalAndLatentNoiseIsSame: Bool = false) -> [(diagonal: MLShapedArray<Float32>, latentNoise: MLShapedArray<Float32>)] {
+        // Each sample is generated on its own, so the first dimension is forced to 1
+        // (presumably the batch dimension — confirm against `Unet.latentSampleShape`).
+        var sampleShape = unet.latentSampleShape
+        sampleShape[0] = 1
+
+        var random = NumPyRandomSource(seed: UInt32(truncatingIfNeeded: seed))
+
+        // Draws one noise array; every call advances the shared random source.
+        func makeNoise() -> MLShapedArray<Float32> {
+            MLShapedArray<Float32>(
+                converting: random.normalShapedArray(sampleShape, mean: 0.0, stdev: Double(stdev)))
+        }
+
+        return (0..<count).map { _ in
+            // Draw order matters for reproducibility: diagonal first, then latent noise.
+            let diagonal = makeNoise()
+            let latentNoise = diagonalAndLatentNoiseIsSame ? diagonal : makeNoise()
+            return (diagonal: diagonal, latentNoise: latentNoise)
+        }
+    }

    func toHiddenStates(_ embedding: MLShapedArray<Float32>) -> MLShapedArray<Float32> {
        // Unoptimized manual transpose [0, 2, None, 1]