# -*- coding: utf-8 -*-
"""stable_diffusion_hybrid.ipynb
Automatically generated by Colaboratory.
Original file is located at
https://colab.research.google.com/drive/1VDAXYQOMdGXg37msoTRgcydZQUGx_t8T
Stable Diffusion : image generation from text with a negative prompt and prompt hybridization
Links:
- https://colab.research.google.com/github/huggingface/notebooks/blob/main/diffusers/stable_diffusion.ipynb
- https://colab.research.google.com/drive/1dlgggNa5Mz8sEAGU0wFCHhGLFooW_pf1?usp=sharing
- https://colab.research.google.com/drive/1qXCgWuH8VWldCszISA58SKp7w0w7OlJZ?usp=sharing
"""
# Stable Diffusion
# Source : https://colab.research.google.com/github/huggingface/notebooks/blob/main/diffusers/stable_diffusion.ipynb
print("Install packages")
!pip install diffusers==0.11.1
!pip install transformers scipy ftfy accelerate
print("Imports")
import torch
torch_device = "cuda" if torch.cuda.is_available() else "cpu"
repo_id = "google/ddpm-church-256"
from transformers import CLIPTextModel, CLIPTokenizer
from diffusers import AutoencoderKL, UNet2DModel, UNet2DConditionModel, PNDMScheduler
# 1. Load the autoencoder model which will be used to decode the latents into image space.
vae = AutoencoderKL.from_pretrained("CompVis/stable-diffusion-v1-4", subfolder="vae")
# 2. Load the tokenizer and text encoder to tokenize and encode the text.
tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-large-patch14")
text_encoder = CLIPTextModel.from_pretrained("openai/clip-vit-large-patch14")
# 3. The UNet model for generating the latents.
unet = UNet2DConditionModel.from_pretrained("CompVis/stable-diffusion-v1-4", subfolder="unet")
# model = UNet2DModel.from_pretrained(repo_id)
from diffusers import LMSDiscreteScheduler
scheduler = LMSDiscreteScheduler.from_pretrained("CompVis/stable-diffusion-v1-4", subfolder="scheduler")
from diffusers import DDPMScheduler
# scheduler = DDPMScheduler.from_config(repo_id)
vae = vae.to(torch_device)
text_encoder = text_encoder.to(torch_device)
unet = unet.to(torch_device)
"""Stable Diffusion is based on a particular type of diffusion model called **Latent Diffusion**, proposed in [High-Resolution Image Synthesis with Latent Diffusion Models](https://arxiv.org/abs/2112.10752).
General diffusion models are machine learning systems that are trained to *denoise* random gaussian noise step by step, to get to a sample of interest, such as an *image*. For a more detailed overview of how they work, check [this colab](https://colab.research.google.com/github/huggingface/notebooks/blob/main/diffusers/diffusers_intro.ipynb).
Latent diffusion can reduce the memory and compute complexity by applying the diffusion process over a lower dimensional _latent_ space, instead of using the actual pixel space (for Stable Diffusion v1, a 512×512×3 image is compressed to a 4×64×64 latent). This is the key difference between standard diffusion and latent diffusion models: **in latent diffusion the model is trained to generate latent (compressed) representations of the images.**
There are three main components in latent diffusion.
1. An autoencoder (VAE).
2. A [U-Net](https://colab.research.google.com/github/huggingface/notebooks/blob/main/diffusers/diffusers_intro.ipynb#scrollTo=wW8o1Wp0zRkq).
3. A text-encoder, *e.g.* [CLIP's Text Encoder](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModel).
**1. The autoencoder (VAE)**
The VAE model has two parts, an encoder and a decoder. The encoder is used to convert the image into a low dimensional latent representation, which will serve as the input to the *U-Net* model.
The decoder, conversely, transforms the latent representation back into an image.
During latent diffusion _training_, the encoder is used to get the latent representations (_latents_) of the images for the forward diffusion process, which applies more and more noise at each step. During _inference_, the denoised latents generated by the reverse diffusion process are converted back into images using the VAE decoder. As we will see during inference we **only need the VAE decoder**.
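To make the two roles concrete, here is a minimal sketch of the VAE round trip, assuming `img` is a 512×512 RGB PIL image and using the `vae` and `torch_device` loaded above (the 0.18215 scaling factor is the same one applied before decoding later in this notebook):

```python
import numpy as np
import torch

# Image -> latent: normalize to [-1, 1], add a batch dimension, then encode.
img_tensor = torch.from_numpy(np.array(img)).float() / 127.5 - 1.0      # (512, 512, 3) in [-1, 1]
img_tensor = img_tensor.permute(2, 0, 1).unsqueeze(0).to(torch_device)  # (1, 3, 512, 512)
with torch.no_grad():
    latents = vae.encode(img_tensor).latent_dist.sample() * 0.18215     # (1, 4, 64, 64)
    # Latent -> image: the decoder maps the compressed representation back to pixel space.
    decoded = vae.decode(latents / 0.18215).sample                      # (1, 3, 512, 512)
```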
**2. The U-Net**
The U-Net has an encoder part and a decoder part both comprised of ResNet blocks.
The encoder compresses an image representation into a lower resolution image representation and the decoder decodes the lower resolution image representation back to the original higher resolution image representation that is supposedly less noisy.
More specifically, the U-Net output predicts the noise residual which can be used to compute the predicted denoised image representation.
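As a rough sketch, a single denoising step with the `unet` and `scheduler` loaded above looks like this, assuming `latents` of shape (1, 4, 64, 64), a timestep `t` taken from `scheduler.timesteps` after `scheduler.set_timesteps(...)` has been called, and `text_embeddings` produced by the text encoder (classifier-free guidance is omitted here and added in the full function below):

```python
# Scale the latents as required by the scheduler, then predict the noise residual.
latent_model_input = scheduler.scale_model_input(latents, t)
with torch.no_grad():
    noise_pred = unet(latent_model_input, t, encoder_hidden_states=text_embeddings).sample
# The scheduler removes (part of) the predicted noise: x_t -> x_{t-1}.
latents = scheduler.step(noise_pred, t, latents).prev_sample
```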
**3. The Text-encoder**
The text-encoder is responsible for transforming the input prompt, *e.g.* "An astronaut riding a horse", into an embedding space that can be understood by the U-Net. It is usually a simple *transformer-based* encoder that maps a sequence of input tokens to a sequence of latent text-embeddings.
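For example, with the `tokenizer` and `text_encoder` loaded above, encoding a prompt into the tensor the U-Net consumes looks roughly like this:

```python
text_input = tokenizer(["An astronaut riding a horse"], padding="max_length",
                       max_length=tokenizer.model_max_length,
                       truncation=True, return_tensors="pt")
with torch.no_grad():
    # Shape (1, 77, 768): one 768-dimensional embedding per token position.
    text_embeddings = text_encoder(text_input.input_ids.to(torch_device))[0]
```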
Here is a typical chart of a Python script using Stable Diffusion:

Function generating the image from the prompt
Parameters:
- guidance scale
- number of inference steps
- seed
- prompt
- negative prompt
Negative prompt: the empty prompt normally used for classifier-free guidance is replaced with a negative prompt.
The predicted noise to remove is obtained by starting from the noise predicted with the negative prompt and moving toward the noise predicted with the (positive) prompt, guidance scale times further.
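In code, this is the guidance step used in the denoising loop below (`noise_pred_uncond` comes from the negative prompt, `noise_pred_text` from the positive prompt):

```python
noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
```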
Prompt hybridization: the single text prompt is replaced by a list of prompts with mix factors. The embedding passed to the U-Net is the linear combination of the embeddings of the individual prompts.
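A minimal sketch of the mixing, assuming `prompts` is a list of `(text, weight)` pairs and `encode_prompt(text)` is a hypothetical helper standing in for the tokenizer + text encoder calls made inside `generate` below:

```python
# Weighted sum of per-prompt CLIP embeddings, e.g. prompts = [("a flower", 0.52), ("a bird", 0.48)].
text_embeddings = sum(weight * encode_prompt(text) for text, weight in prompts)
```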
"""
from tqdm.auto import tqdm
from torch import autocast
from PIL import Image
from IPython.display import display  # provided automatically in Colab; imported so the script also runs outside a notebook
def generate(guidance_scale, num_inference_steps, seed, prompts, nprompts=[("", 1)]):
    """Generate one image from weighted prompts and weighted negative prompts."""
    batch_size = 1
    height = 512
    width = 512
    # Prompt encoding
    # text_input = tokenizer(prompt, padding="max_length", max_length=tokenizer.model_max_length, truncation=True, return_tensors="pt")
    # with torch.no_grad():
    #     text_embeddings = text_encoder(text_input.input_ids.to(torch_device))[0]
    i = 0
    for prompt in prompts:
        i += 1
        text_input = tokenizer([prompt[0]], padding="max_length", max_length=tokenizer.model_max_length, truncation=True, return_tensors="pt")
        with torch.no_grad():
            text_embeddings_i = text_encoder(text_input.input_ids.to(torch_device))[0]
        if i == 1:
            text_embeddings = text_embeddings_i * prompt[1]
        else:
            text_embeddings += text_embeddings_i * prompt[1]
    # Prompt amplification (classifier-free guidance):
    # the final text embeddings are the concatenation of the embeddings of the negative prompt(s)
    # (the empty prompt by default) and the embeddings of the (positive) prompt(s)
    max_length = text_input.input_ids.shape[-1]
    i = 0
    for nprompt in nprompts:
        i += 1
        uncond_input = tokenizer(
            [nprompt[0]] * batch_size, padding="max_length", max_length=max_length, return_tensors="pt"
        )
        with torch.no_grad():
            uncond_embeddings_i = text_encoder(uncond_input.input_ids.to(torch_device))[0]
        if i == 1:
            uncond_embeddings = uncond_embeddings_i * nprompt[1]
        else:
            uncond_embeddings += uncond_embeddings_i * nprompt[1]
    text_embeddings = torch.cat([uncond_embeddings, text_embeddings])
    # Initial random noise
    generator = torch.manual_seed(seed)  # Seed generator to create the initial latent noise
    latents = torch.randn(
        (batch_size, unet.in_channels, height // 8, width // 8),
        generator=generator,
    )
    latents = latents.to(torch_device)
    # Denoising loop
    scheduler.set_timesteps(num_inference_steps)
    latents = latents * scheduler.init_noise_sigma
    for t in tqdm(scheduler.timesteps):
        # latent_model_input = torch.Tensor(latents)
        # Prompt amplification
        # expand the latents if we are doing classifier-free guidance to avoid doing two forward passes.
        latent_model_input = torch.cat([latents] * 2)
        latent_model_input = scheduler.scale_model_input(latent_model_input, t)
        # predict the noise residual
        with torch.no_grad():
            noise_pred = unet(latent_model_input, t, encoder_hidden_states=text_embeddings).sample
        # Prompt amplification
        # perform guidance
        noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
        noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
        # compute the previous noisy sample x_t -> x_t-1
        latents = scheduler.step(noise_pred, t, latents).prev_sample
    # Decode and return the image
    # scale and decode the image latents with the VAE
    latents = 1 / 0.18215 * latents
    with torch.no_grad():
        image = vae.decode(latents).sample
    image = (image / 2 + 0.5).clamp(0, 1)
    image = image.detach().cpu().permute(0, 2, 3, 1).numpy()
    images = (image * 255).round().astype("uint8")
    pil_images = [Image.fromarray(image) for image in images]
    return pil_images[0]
"""Generate an image from a simple prompt"""
guidance_scale = 6
num_inference_steps = 20
seed = 13
prompt = [("An astronaut riding a horse", 1)]
image = generate(guidance_scale, num_inference_steps, seed, prompt)
display(image)
"""Prompt hybridation : a flower-bird"""
guidance_scale = 8
num_inference_steps = 12
seed = 32
prompt = [("a flower", 0.52), ("a bird", 0.48)]
image = generate(guidance_scale, num_inference_steps, seed, prompt)
display(image)
"""Negative prompts : gardens with few flowers"""
guidance_scale = 7.5
seeds = range(0, 8)
num_inference_steps = 20
prompt = [("A garden",1)]
nprompt = [("flower", 1)]
for seed in seeds:
    image = generate(guidance_scale, num_inference_steps, seed, prompt)
    print(f"seed={seed} without negative prompt")
    display(image)
    image = generate(guidance_scale, num_inference_steps, seed, prompt, nprompt)
    print(f"seed={seed} with negative prompt")
    display(image)
"""Different mix factors and different guidance scales with the same text give different results.
"""
guidance_scales = [1, 7.5, 12]
seed = 3
num_inference_steps = 20
prompts = [[("A garden", 0.8)], [("A garden", 1)], [("A garden", 1.25)]]
for prompt in prompts:
    for guidance_scale in guidance_scales:
        image = generate(guidance_scale, num_inference_steps, seed, prompt)
        print(f"prompt={prompt} scale={guidance_scale}")
        display(image)
"""If the guidance scale is less than one, the negative prompt becomes positive, but the results are strange."""
guidance_scale = 0.52
seeds = range(32, 40)
num_inference_steps = 12
prompt = [("a flower", 1)]
nprompt = [("a bird", 1)]
for seed in seeds:
    image = generate(guidance_scale, num_inference_steps, seed, prompt, nprompt)
    print(f"seed={seed}")
    display(image)