sample_size=28, # the target image resolution
in_channels=1, # the number of input channels; 3 for RGB images
out_channels=1, # the number of output channels
layers_per_block=2, # how many ResNet layers to use in each UNet block
block_out_channels=(32, 64, 64), # roughly the same configuration as the BasicUNet model
down_block_types=(
"DownBlock2D", # a regular ResNet downsampling block
"AttnDownBlock2D", # a ResNet downsampling block with spatial self-attention
"AttnDownBlock2D",
),
up_block_types=(
"AttnUpBlock2D",
"AttnUpBlock2D", # a ResNet upsampling block with spatial self-attention
"UpBlock2D", # a regular ResNet upsampling block
),
)
 
# Print the model structure (verbose, but very readable)
print(model)

Next, let's check how many parameters the UNet2DModel has:

sum([p.numel() for p in model.parameters()])
# The UNet2DModel uses roughly 1.7 million parameters, while the BasicUNet model uses a little over 300,000
# Output
1707009
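
As a quick sanity check, the same expression gives the BasicUNet parameter count. This is a minimal sketch, assuming the BasicUNet class defined earlier in this article is still in scope:

basic_net = BasicUNet()  # the simple UNet defined earlier in this article
sum([p.numel() for p in basic_net.parameters()])
# Should come out to a little over 300,000 parameters, as noted above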

Below we repeat the training and sampling procedure shown earlier, using UNet2DModel in place of the BasicUNet model (here t=0 is passed in, to indicate that the model is trained without timestep conditioning). The full code is as follows:

#@markdown Trying UNet2DModel instead of BasicUNet:

# Dataloader (you can mess with batch size)
batch_size = 128
train_dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

# How many runs through the data should we do?
n_epochs = 3

# Create the network
net = UNet2DModel(
sample_size=28, # the target image resolution
in_channels=1, # the number of input channels, 3 for RGB images
out_channels=1, # the number of output channels
layers_per_block=2, # how many ResNet layers to use per UNet block
block_out_channels=(32, 64, 64), # Roughly matching our basic unet example
down_block_types=(
"DownBlock2D", # a regular ResNet downsampling block
"AttnDownBlock2D", # a ResNet downsampling block with spatial self-attention
"AttnDownBlock2D",
),
up_block_types=(
"AttnUpBlock2D",
"AttnUpBlock2D", # a ResNet upsampling block with spatial self-attention
"UpBlock2D", # a regular ResNet upsampling block
),
) #<<<
net.to(device)

# Our loss function
loss_fn = nn.MSELoss()

# The optimizer
opt = torch.optim.Adam(net.parameters(), lr=1e-3)

# Keeping a record of the losses for later viewing
losses = []

# The training loop
for epoch in range(n_epochs):

    for x, y in train_dataloader:

        # Get some data and prepare the corrupted version
        x = x.to(device) # Data on the GPU
        noise_amount = torch.rand(x.shape[0]).to(device) # Pick random noise amounts
        noisy_x = corrupt(x, noise_amount) # Create our noisy x

        # Get the model prediction
        pred = net(noisy_x, 0).sample #<<< Using timestep 0 always, adding .sample

        # Calculate the loss
        loss = loss_fn(pred, x) # How close is the output to the true 'clean' x?

        # Backprop and update the params:
        opt.zero_grad()
        loss.backward()
        opt.step()

        # Store the loss for later
        losses.append(loss.item())

    # Print out the average of the loss values for this epoch:
    avg_loss = sum(losses[-len(train_dataloader):])/len(train_dataloader)
    print(f'Finished epoch {epoch}. Average loss for this epoch: {avg_loss:05f}')

# Plot losses and some samples
fig, axs = plt.subplots(1, 2, figsize=(12, 5))

# Losses
axs[0].plot(losses)
axs[0].set_ylim(0, 0.1)
axs[0].set_title('Loss over time')

# Samples
n_steps = 40
x = torch.rand(64, 1, 28, 28).to(device)
for i in range(n_steps):
    noise_amount = torch.ones((x.shape[0], )).to(device) * (1-(i/n_steps)) # Starting high going low
    with torch.no_grad():
        pred = net(x, 0).sample
    mix_factor = 1/(n_steps - i)
    x = x*(1-mix_factor) + pred*mix_factor

axs[1].imshow(torchvision.utils.make_grid(x.detach().cpu(), nrow=8)[0].clip(0, 1), cmap='Greys')
axs[1].set_title('Generated Samples');
# Output
Finished epoch 0. Average loss for this epoch: 0.020033
Finished epoch 1. Average loss for this epoch: 0.013243
Finished epoch 2. Average loss for this epoch: 0.011795

As you can see, the generated results are somewhat better than those produced by the BasicUNet network.

DDPM Principles

Paper: Denoising Diffusion Probabilistic Models

Paper link: https://arxiv.org/pdf/2006.11239.pdf

Below is the formula from the DDPM paper. The Training step is essentially the degradation (forward) process, in which noise is gradually added to the original image; the model's prediction target is to fit the noise sampled at each timestep.
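
For reference, the simplified training objective from the DDPM paper can be written as:

$L_{\mathrm{simple}}(\theta) = \mathbb{E}_{t,\,x_0,\,\epsilon}\left[\,\lVert \epsilon - \epsilon_\theta(\sqrt{\bar{\alpha}_t}\,x_0 + \sqrt{1-\bar{\alpha}_t}\,\epsilon,\ t) \rVert^2 \right]$

where $\epsilon \sim \mathcal{N}(0, I)$ is the sampled noise, $\bar{\alpha}_t$ is the cumulative product of the per-step noise-schedule coefficients, and $\epsilon_\theta$ is the network's noise prediction.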

One more important point: we know that noise is added continuously during the forward process, but the noise coefficient is not fixed; it increases linearly with the timestep t (this is also called the diffusion rate). The benefit is that at the start of the reverse process the "obvious" noise is removed first, which corresponds to a larger diffusion rate; as the sample gradually approaches the real image, the denoising rate slows down and fine adjustments begin, corresponding to a smaller diffusion rate.

Let's use code to look at how the input data and the noise are weighted at different timesteps:

from diffusers import DDPMScheduler

noise_scheduler = DDPMScheduler(num_train_timesteps=1000)
plt.plot(noise_scheduler.alphas_cumprod.cpu() ** 0.5, label=r"${\sqrt{\bar{\alpha}_t}}$")
plt.plot((1 - noise_scheduler.alphas_cumprod.cpu()) ** 0.5, label=r"$\sqrt{(1 - \bar{\alpha}_t)}$")
plt.legend(fontsize="x-large");

The resulting plot is shown below: $\sqrt{\bar{\alpha}_t}$ decays from 1 toward 0 as t increases, while $\sqrt{1 - \bar{\alpha}_t}$ grows from 0 toward 1.
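These two curves are exactly the coefficients of DDPM's one-shot forward (noising) formula:

$q(x_t \mid x_0) = \mathcal{N}\left(\sqrt{\bar{\alpha}_t}\,x_0,\ (1-\bar{\alpha}_t)I\right)$, i.e. $x_t = \sqrt{\bar{\alpha}_t}\,x_0 + \sqrt{1-\bar{\alpha}_t}\,\epsilon$ with $\epsilon \sim \mathcal{N}(0, I)$, where $\bar{\alpha}_t = \prod_{s=1}^{t}(1-\beta_s)$.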

Next, let's see what the DDPM noising scheme looks like when applied to the MNIST dataset at different timesteps:

# Visualization: different timesteps of the DDPM noising process
# Add noise to a batch of images to see the effect
fig, axs = plt.subplots(3, 1, figsize=(16, 10))
xb, yb = next(iter(train_dataloader))
xb = xb.to(device)[:8]
xb = xb * 2. - 1. # Map to (-1, 1)
print('X shape', xb.shape)

# Show the clean original inputs
axs[0].imshow(torchvision.utils.make_grid(xb[:8])[0].detach().cpu(), cmap='Greys')
axs[0].set_title('Clean X')

# Add noise with the scheduler
timesteps = torch.linspace(0, 999, 8).long().to(device)
noise = torch.randn_like(xb) # << Note: randn, not rand
noisy_xb = noise_scheduler.add_noise(xb, noise, timesteps)
print('Noisy X shape', noisy_xb.shape)

# Show the "noisy" versions (with and without clipping)
axs[1].imshow(torchvision.utils.make_grid(noisy_xb[:8])[0].detach().cpu().clip(-1, 1), cmap='Greys')
axs[1].set_title('Noisy X (clipped to (-1, 1))')
axs[2].imshow(torchvision.utils.make_grid(noisy_xb[:8])[0].detach().cpu(), cmap='Greys')
axs[2].set_title('Noisy X');

# Output
X shape torch.Size([8, 1, 28, 28])
Noisy X shape torch.Size([8, 1, 28, 28])

The results are shown in the figure below:
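
Under the hood, add_noise applies the one-shot forward formula shown earlier. Here is a minimal sketch of the equivalent manual computation, using the scheduler's alphas_cumprod buffer (the variable names are my own, not from the article):

# Manual equivalent of noise_scheduler.add_noise for this batch
alpha_bar = noise_scheduler.alphas_cumprod.to(device)[timesteps]      # one alpha-bar per image
sqrt_alpha_bar = alpha_bar.sqrt().view(-1, 1, 1, 1)                   # sqrt(alpha_bar_t)
sqrt_one_minus = (1 - alpha_bar).sqrt().view(-1, 1, 1, 1)             # sqrt(1 - alpha_bar_t)
manual_noisy_xb = sqrt_alpha_bar * xb + sqrt_one_minus * noise
print(torch.allclose(manual_noisy_xb, noisy_xb, atol=1e-4))           # expect True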

More on Sampling

Sampling plays a very important role in diffusion models. Could we feed in pure noise and expect the model to output a noise-free image in a single step? Based on what we have covered so far, that clearly does not work. So what ideas are there for improving sampling?
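
For context, the standard DDPM answer is to denoise iteratively, stepping backwards through the scheduler's timesteps. Below is a minimal sketch using diffusers' DDPMScheduler; it assumes a hypothetical noise_pred_net trained with the noise-prediction objective above, not the image-predicting net trained earlier:

# Iterative DDPM sampling sketch (noise_pred_net is a placeholder name)
sample = torch.randn(8, 1, 28, 28).to(device)      # start from pure noise
for t in noise_scheduler.timesteps:                 # t runs from 999 down to 0
    with torch.no_grad():
        noise_pred = noise_pred_net(sample, t).sample   # predict the noise at this step
    # Remove a little noise: compute x_{t-1} from x_t and the prediction
    sample = noise_scheduler.step(noise_pred, t, sample).prev_sample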

Article reposted from the WeChat public account @ArronAI
