학습 결과를 공유하고 싶다면?

학습 결과를 저장할 필요가 있다.

torch.save()

학습의 결과를 저장하기 위한 함수
모델 형태(architecture)와 파라미터를 저장
- 파라미터만 저장하면 적은 용량으로 저장 가능
모델 학습 중간 과정의 저장을 통해 최선의 결과모델을 선택
만들어진 모델을 외부 연구자와 공유하여 학습 재현성 향상

import torch
import torch.nn as nn
import torch.optim as optim
import os

class TheModelClass(nn.Module):
    """Small three-stage CNN that maps a 3-channel image to one output value.

    Every stage is Conv2d -> BatchNorm2d -> ReLU -> MaxPool2d. The flattened
    feature map then goes through dropout and two linear layers. ``fc1``
    expects ``3 * 3 * 64`` input features, so the last stage must produce a
    64 x 3 x 3 map (which is what a 224 x 224 input yields).
    """

    @staticmethod
    def _conv_stage(in_channels, out_channels):
        """Build one Conv2d / BatchNorm2d / ReLU / MaxPool2d stage."""
        return nn.Sequential(
            nn.Conv2d(in_channels, out_channels, kernel_size=3, stride=2, padding=0),
            nn.BatchNorm2d(out_channels),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2),
        )

    def __init__(self):
        super().__init__()

        # Attribute names become state_dict key prefixes, so they must stay
        # stable for previously saved parameter files to keep loading.
        self.layer1 = self._conv_stage(3, 16)
        self.layer2 = self._conv_stage(16, 32)
        self.layer3 = self._conv_stage(32, 64)

        self.drop_out = nn.Dropout()
        self.fc1 = nn.Linear(3 * 3 * 64, 1000)
        self.fc2 = nn.Linear(1000, 1)

    def forward(self, x):
        """Run the conv stages, flatten per sample, then dropout -> fc1 -> fc2."""
        features = self.layer3(self.layer2(self.layer1(x)))

        # Collapse everything except the batch dimension.
        flat = features.view(features.size(0), -1)
        return self.fc2(self.fc1(self.drop_out(flat)))

# Use the GPU when one is available, otherwise fall back to the CPU so this
# snippet also runs on machines without CUDA (a bare .cuda() would raise there).
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Initialize model
model = TheModelClass().to(device)

# Initialize optimizer
optimizer = optim.SGD(model.parameters(), lr=0.1, momentum=0.9)

# Print model's state_dict: one line per entry with its name and tensor size.
# The state_dict is built once and iterated, instead of being rebuilt per key.
print("Model's state_dict:")
for param_name, tensor in model.state_dict().items():
    print(param_name, "\t", tensor.size())

'''
Model's state_dict:
layer1.0.weight      torch.Size([16, 3, 3, 3])
layer1.0.bias      torch.Size([16])
layer1.1.weight      torch.Size([16])
layer1.1.bias      torch.Size([16])
layer1.1.running_mean      torch.Size([16])
layer1.1.running_var      torch.Size([16])
layer1.1.num_batches_tracked      torch.Size([])
layer2.0.weight      torch.Size([32, 16, 3, 3])
layer2.0.bias      torch.Size([32])
layer2.1.weight      torch.Size([32])
layer2.1.bias      torch.Size([32])
layer2.1.running_mean      torch.Size([32])
layer2.1.running_var      torch.Size([32])
layer2.1.num_batches_tracked      torch.Size([])
layer3.0.weight      torch.Size([64, 32, 3, 3])
layer3.0.bias      torch.Size([64])
layer3.1.weight      torch.Size([64])
layer3.1.bias      torch.Size([64])
layer3.1.running_mean      torch.Size([64])
layer3.1.running_var      torch.Size([64])
layer3.1.num_batches_tracked      torch.Size([])
fc1.weight      torch.Size([1000, 576])
fc1.bias      torch.Size([1000])
fc2.weight      torch.Size([1, 1000])
fc2.bias      torch.Size([1])
'''

from torchsummary import summary

# Print a per-layer table of output shapes and parameter counts for a
# 3-channel 224 x 224 input.
summary(model, input_size=(3, 224, 224))

'''
----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
================================================================
            Conv2d-1         [-1, 16, 111, 111]             448
       BatchNorm2d-2         [-1, 16, 111, 111]              32
              ReLU-3         [-1, 16, 111, 111]               0
         MaxPool2d-4           [-1, 16, 55, 55]               0
            Conv2d-5           [-1, 32, 27, 27]           4,640
       BatchNorm2d-6           [-1, 32, 27, 27]              64
              ReLU-7           [-1, 32, 27, 27]               0
         MaxPool2d-8           [-1, 32, 13, 13]               0
            Conv2d-9             [-1, 64, 6, 6]          18,496
      BatchNorm2d-10             [-1, 64, 6, 6]             128
             ReLU-11             [-1, 64, 6, 6]               0
        MaxPool2d-12             [-1, 64, 3, 3]               0
          Dropout-13                  [-1, 576]               0
           Linear-14                 [-1, 1000]         577,000
           Linear-15                    [-1, 1]           1,001
================================================================
Total params: 601,809
Trainable params: 601,809
Non-trainable params: 0
----------------------------------------------------------------
Input size (MB): 0.57
Forward/backward pass size (MB): 5.53
Params size (MB): 2.30
Estimated Total Size (MB): 8.40
----------------------------------------------------------------
'''

MODEL_PATH = "saved"
# exist_ok=True creates the directory only when it is missing, without the
# check-then-create race of a separate os.path.exists() test.
os.makedirs(MODEL_PATH, exist_ok=True)
# state_dict(): the model's parameters, keyed by name
torch.save(model.state_dict(), os.path.join(MODEL_PATH, "model.pt"))

# Load the saved parameters
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# The receiving model must have the same architecture as the one that was saved.
new_model = TheModelClass().to(device)
# Load the parameters (only possible into a model with the same structure).
# map_location remaps tensors that were saved from a GPU onto the current
# device, so the file also loads on a CPU-only machine.
new_model.load_state_dict(
    torch.load(os.path.join(MODEL_PATH, "model.pt"), map_location=device))

# Save the whole model object (architecture + parameters); the model itself
# is serialized with pickle.
torch.save(model, os.path.join(MODEL_PATH, "model_pickle.pt"))
# Load the model together with its architecture.
# SECURITY: unpickling can execute arbitrary code - only load files you trust.
# NOTE: PyTorch 2.6+ defaults torch.load to weights_only=True, which rejects a
# full-model pickle; pass weights_only=False there (trusted files only).
# map_location remaps GPU-saved tensors so the load also works without CUDA.
model = torch.load(os.path.join(MODEL_PATH, "model_pickle.pt"), map_location=device)

보통 파라미터만 저장하고 불러오는 방식을 더 많이 사용
모델 구조까지 저장하고 불러오는 방식은 pickle이 모델 클래스 정의를 참조만 하므로, 불러오는 쪽에도 동일한 모델 클래스 코드가 있어야 해서 번거로움

checkpoints

학습의 중간 결과를 저장하여 최선의 결과를 선택
early stopping 기법 사용시 이전 학습의 결과물을 저장
loss와 metric 값을 지속적으로 확인 저장
일반적으로 epoch, loss, metric을 함께 저장하여 확인
colab에서 지속적인 학습을 위해 필요

# Train for EPOCHS epochs and write one checkpoint file per epoch.
for e in range(1, EPOCHS+1):
    epoch_loss = 0
    epoch_acc = 0
    for X_batch, y_batch in dataloader:
        # Move the batch to the training device. Targets are cast to float32
        # in a device-agnostic way: torch.cuda.FloatTensor only exists for
        # CUDA and fails on CPU-only machines.
        X_batch = X_batch.to(device)
        y_batch = y_batch.to(device, dtype=torch.float32)
        targets = y_batch.unsqueeze(1)  # add a trailing dimension of size 1

        optimizer.zero_grad()
        y_pred = model(X_batch)

        loss = criterion(y_pred, targets)
        acc = binary_acc(y_pred, targets)

        loss.backward()
        optimizer.step()

        # Accumulate per-batch values; they are averaged over batches below.
        epoch_loss += loss.item()
        epoch_acc += acc.item()

    # Per-batch averages, computed once and reused for the file name and log.
    num_batches = len(dataloader)
    avg_loss = epoch_loss / num_batches
    avg_acc = epoch_acc / num_batches

    # Store the epoch number with the model and optimizer state so training
    # can be resumed later. 'loss' is the summed (not averaged) epoch loss.
    torch.save({
        'epoch': e,
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict(),
        'loss': epoch_loss,
        }, f"saved/checkpoint_model_{e}_{avg_loss}_{avg_acc}.pt")

    print(f'Epoch {e:03}: | Loss: {avg_loss:.5f} | Acc: {avg_acc:.3f}')

# Usage example: restore model/optimizer state and bookkeeping values from a
# checkpoint file.
# map_location remaps tensors that were saved on a GPU onto the current
# device, so a checkpoint can also be resumed on a CPU-only machine.
checkpoint = torch.load(PATH, map_location=device)
model.load_state_dict(checkpoint['model_state_dict'])
optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
epoch = checkpoint['epoch']
loss = checkpoint['loss']

Transfer learning

다른 데이터셋으로 만든 모델을 현재 데이터에 적용
일반적으로 대용량 데이터셋으로 만들어진 모델의 성능 ↑
현재의 DL에서는 가장 일반적인 학습 기법
backbone architecture가 잘 학습된 모델에서 일부분만 변경하여 학습을 수행함
CV는 TorchVision, NLP는 HuggingFace가 사실상 표준

Freezing

pretrained model을 활용시 모델의 일부분을 frozen 시키고 학습

from torch import nn
from torchvision import models

class MyNewNet(nn.Module):
    """Pretrained VGG19 followed by one extra linear layer.

    The extra layer takes 1000 input features and produces a single output
    value.
    """

    def __init__(self):
        super().__init__()
        self.vgg19 = models.vgg19(pretrained=True)
        # Extra linear layer appended after the pretrained network.
        self.linear_layers = nn.Linear(1000, 1)

    def forward(self, x):
        """Feed ``x`` through VGG19, then through the appended linear layer."""
        backbone_out = self.vgg19(x)
        return self.linear_layers(backbone_out)

my_model = MyNewNet().to(device)

# Freeze everything except the appended linear layer: only parameters under
# `linear_layers` keep requires_grad=True, so gradients are computed for
# that layer alone.
for param_name, param in my_model.named_parameters():
    param.requires_grad = param_name.startswith("linear_layers.")

모델 저장할 때 pth 확장자명은 안쓰는 것을 추천
- .pth는 파이썬이 경로 설정 파일(site-packages의 path configuration file)에 이미 사용하는 확장자라서 혼동될 수 있음