

Day 34. Deep Learning - Modular & TensorBoard & HPO Tuning


 

Day 34 Retrospective.

 

Today's class was mostly about modularizing everything we have learned so far, so it went fairly smoothly. During the unit project, though, PyCaret, which had been running fine up until yesterday, kept throwing errors when I ran it again today, which was frustrating. I tried to fix it but couldn't, so I decided to just run the models one by one myself and compare the results.

 

 

 

 

1. Modular

 

 

Visual Studio Code

 

Create a virtual environment and install modules

py -3.12 -m venv .venv
.\.venv\Scripts\activate
python -m pip install --upgrade pip
pip install torch torchvision torchinfo
pip install jupyter matplotlib tqdm

pip freeze > requirements.txt

# To create the virtual environment from someone else's requirements.txt:
# py -3.12 -m venv .venv
# .\.venv\Scripts\activate
# python -m pip install --upgrade pip
# pip install -r requirements.txt


 

File Structure

learning_modular/
├── service/
│   ├── models/
│   │   └── vgg_model.py
│   ├── data_setup.py		# a file to prepare and download data if needed
│   ├── engine.py		# a file containing various training functions
│   ├── model_builder.py	# a file to create a PyTorch model
│   ├── train.py		# a file to leverage all other files and train a target PyTorch model
│   └── utils.py		# a file dedicated to helpful utility functions
├── models/
│   └── LearningModular.pth
├── data/
│   └── pizza_steak_sushi/
│       ├── train/
│       │   ├── pizza/
│       │   │   ├── image01.jpeg
│       │   │   └── ...
│       │   ├── steak/
│       │   └── sushi/
│       └── test/
│           ├── pizza/
│           ├── steak/
│           └── sushi/
└── test.ipynb

 

 

service/

 

data_setup.py

import os
from pathlib import Path

from torchvision import datasets, transforms
from torch.utils.data import Dataset, DataLoader

def create_dataset(
        root:str='./data/pizza_steak_sushi',
        train:bool=True,
        transform:transforms.Compose=None
    ) -> Dataset:
    if train:
        root = Path(root) / 'train'
    else:
        root = Path(root) / 'test'
    
    return datasets.ImageFolder(root=root, transform=transform)

def create_dataloader(
        dataset:Dataset,
        batch_size:int=32,
        shuffle:bool=True
    ):
    return DataLoader(
        dataset=dataset,
        batch_size=batch_size,
        shuffle=shuffle
    )
# test.ipynb

from torchvision import transforms
from service.data_setup import create_dataset, create_dataloader

my_transform = transforms.Compose([
    transforms.Resize((64, 64)),
    transforms.ToTensor()
])

train_dataset = create_dataset(train=True, transform=my_transform)
test_dataset = create_dataset(train=False, transform=my_transform)
len(train_dataset), len(test_dataset)
# (225, 75)

train_dataloader = create_dataloader(train_dataset)
test_dataloader = create_dataloader(test_dataset)
len(train_dataloader), len(test_dataloader)
# (8, 3)

features, targets = next(iter(train_dataloader))
features.shape
# torch.Size([32, 3, 64, 64])
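
Since matplotlib is already installed, it can be worth eyeballing a sample before training. A minimal sanity-check sketch (not in the original notebook), reusing the features/targets batch from above:

import matplotlib.pyplot as plt

img, label = features[0], targets[0]		# first sample of the batch above
plt.imshow(img.permute(1, 2, 0))		# CHW -> HWC for matplotlib
plt.title(train_dataset.classes[label.item()])
plt.axis('off')
plt.show()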

 

models/vgg_model.py

import torch
from torch import nn

class VGGBlock(nn.Module):
    def __init__(self, in_channels, out_channels, kernel_size=3):	# default kernel_size so callers can omit it
        super().__init__()
        self.block = nn.Sequential(
            nn.Conv2d(
                in_channels=in_channels,
                out_channels=out_channels,
                kernel_size=kernel_size
            ),
            nn.ReLU(),
            nn.Conv2d(
                in_channels=out_channels,	# the second conv takes the first conv's output channels
                out_channels=out_channels,
                kernel_size=kernel_size
            ),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2)
        )
    
    def forward(self, x):
        return self.block(x)

class ClassifierBlock(nn.Module):
    def __init__(self, in_features, out_features):
        super().__init__()
        self.block = nn.Sequential(
            nn.Flatten(),
            nn.Linear(
                in_features=in_features,
                out_features=out_features
            )
        )
    
    def forward(self, x):
        return self.block(x)

class TinyVGG(nn.Module):
    def __init__(self, color_size, target_size, hidden_size=10):
        super().__init__()
        self.vgg1 = VGGBlock(in_channels=color_size, out_channels=hidden_size)
        self.vgg2 = VGGBlock(in_channels=hidden_size, out_channels=hidden_size)
        self.classifier = ClassifierBlock(in_features=hidden_size*13*13, out_features=target_size)
    
    def forward(self, x):
        vgg1_out = self.vgg1(x)
        vgg2_out = self.vgg2(vgg1_out)
        return self.classifier(vgg2_out)
# test.ipynb

import torch
from service.models.vgg_model import VGGBlock, ClassifierBlock, TinyVGG

# Debugging
input_data = torch.rand(size=(10, 3, 64, 64))
input_data.shape
# torch.Size([10, 3, 64, 64])

vgg1 = VGGBlock(in_channels=3, out_channels=10)
vgg1_out = vgg1(input_data)
vgg1_out.shape
# torch.Size([10, 10, 30, 30])

vgg2 = VGGBlock(in_channels=10, out_channels=10)
vgg2_out = vgg2(vgg1_out)
vgg2_out.shape
# torch.Size([10, 10, 13, 13])

# Test
import torch
from service.models.vgg_model import TinyVGG

model = TinyVGG(color_size=3, target_size=3)

input_data = torch.randn(size=(64, 3, 64, 64))
input_data.shape
# torch.Size([64, 3, 64, 64])

pred = model(input_data)
pred.shape
# torch.Size([64, 3])

 

model_builder.py

from models.vgg_model import TinyVGG

def create_model(
        color_size,
        target_size,
        is_trained:bool=False
):
    if is_trained:
        # loading a trained checkpoint is not implemented yet, so this branch
        # currently returns None
        return
    
    return TinyVGG(color_size=color_size, target_size=target_size)
# test.ipynb

from service.model_builder import create_model

my_vgg = create_model(color_size=3, target_size=3)
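
torchinfo was installed along with torch, so the freshly built model can be summarized layer by layer. A quick sketch, assuming the same 64x64 RGB input used above:

from torchinfo import summary

summary(my_vgg, input_size=(32, 3, 64, 64))	# (batch, channels, height, width)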

 

engine.py

import torch
from tqdm.auto import tqdm
from typing import Dict, List, Tuple

from utils import EarlyStopper

def train_step(
        model: torch.nn.Module,
        dataloader:torch.utils.data.DataLoader,
        loss_fn: torch.nn.Module,
        optimizer: torch.optim.Optimizer,
        device: torch.device, 
        valid_fn=None
) -> Tuple[float, float]:

    model.train()

    train_loss, train_valid = 0, 0

    for _, (X, y) in tqdm(enumerate(dataloader), desc='Train Loop', leave=False, total=len(dataloader)):
        X, y = X.to(device), y.to(device)

        y_pred = model(X)

        loss = loss_fn(y_pred, y)
        train_loss += loss.item()

        if valid_fn:
            train_valid += valid_fn(y_pred.argmax(dim=1).cpu().numpy(), y.cpu().numpy())

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    train_loss = train_loss / len(dataloader)
    train_valid = train_valid / len(dataloader)
    
    return train_loss, train_valid

def test_step(
        model: torch.nn.Module,
        dataloader: torch.utils.data.DataLoader,
        loss_fn: torch.nn.Module,
        device: torch.device,
        valid_fn=None
) -> Tuple[float, float]:
    
    model.eval()

    test_loss, test_valid = 0, 0

    with torch.inference_mode():
        for _, (X, y) in tqdm(enumerate(dataloader), desc='Testing Loop', leave=False, total=len(dataloader)):
            X, y = X.to(device), y.to(device)

            test_pred_logits = model(X)

            loss = loss_fn(test_pred_logits, y)
            test_loss += loss.item()

            if valid_fn:
                test_valid += valid_fn(test_pred_logits.argmax(dim=1).cpu().numpy(), y.cpu().numpy())

    test_loss = test_loss / len(dataloader)
    test_valid = test_valid / len(dataloader)
    
    return test_loss, test_valid


def train(
        model: torch.nn.Module,
        train_dataloader: torch.utils.data.DataLoader,
        test_dataloader: torch.utils.data.DataLoader,
        optimizer: torch.optim.Optimizer,
        loss_fn: torch.nn.Module,
        epochs: int,
        device: torch.device,
        early_stopper:EarlyStopper,
        valid_fn=None
) -> Dict[str, List]:

    results = {
        'train_loss': [],
        'train_valid': [],
        'test_loss': [],
        'test_valid': []
    }

    for epoch in tqdm(range(epochs), desc='Epoch Loop', leave=True):
        train_loss, train_valid = train_step(
            model=model,
            dataloader=train_dataloader,
            loss_fn=loss_fn,
            optimizer=optimizer,
            device=device,
            valid_fn=valid_fn
        )
        
        test_loss, test_valid = test_step(
            model=model,
            dataloader=test_dataloader,
            loss_fn=loss_fn,
            device=device, valid_fn=valid_fn
        )

        print(
            f'Epoch: {epoch+1} | '
            f'train_loss: {train_loss:.4f} | '
            f'test_loss: {test_loss:.4f} | '
        )

        results['train_loss'].append(train_loss)
        results['test_loss'].append(test_loss)
        
        if valid_fn:
            results['train_valid'].append(train_valid)
            results['test_valid'].append(test_valid)

        if not early_stopper.is_continuable(model=model, loss=test_loss):
            break

    return results
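
A sketch of calling engine.train from test.ipynb. Note that engine.py imports utils directly, so service/ itself has to be importable; the accuracy helper is my own (valid_fn receives numpy label arrays and returns a scalar):

import sys
sys.path.append('./service')			# so `from utils import EarlyStopper` inside engine.py resolves

import torch
from engine import train
from utils import EarlyStopper

def accuracy_fn(y_pred, y_true):		# hypothetical valid_fn: numpy arrays in, float out
    return (y_pred == y_true).mean()

device = 'cuda' if torch.cuda.is_available() else 'cpu'
model = my_vgg.to(device)			# model created earlier in the notebook
loss_fn = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
early_stopper = EarlyStopper(num_trials=5, save_path='./models/LearningModular.pth')

results = train(
    model=model,
    train_dataloader=train_dataloader,
    test_dataloader=test_dataloader,
    optimizer=optimizer,
    loss_fn=loss_fn,
    epochs=10,
    device=device,
    early_stopper=early_stopper,
    valid_fn=accuracy_fn
)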

 

utils.py

import torch 
import numpy as np 

class EarlyStopper(object):

    def __init__(self, num_trials, save_path):
        self.num_trials = num_trials
        self.trial_counter = 0
        self.best_loss = np.inf
        self.save_path = save_path

    def is_continuable(self, model, loss):
        if loss < self.best_loss:
            self.best_loss = loss
            self.trial_counter = 0
            torch.save(model, self.save_path)
            return True
        elif self.trial_counter + 1 < self.num_trials:
            self.trial_counter += 1
            return True
        else:
            return False

    def get_best_model(self, device):
        return torch.load(self.save_path).to(device)
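
torch.save(model, ...) pickles the entire module, which ties the checkpoint to the exact class and module path. A state_dict-based variant (my own rewrite, not the lesson's code) avoids that coupling at the cost of needing a model instance to load into:

class StateDictEarlyStopper(EarlyStopper):
    """Hypothetical variant that checkpoints only the state_dict."""

    def is_continuable(self, model, loss):
        if loss < self.best_loss:
            self.best_loss = loss
            self.trial_counter = 0
            torch.save(model.state_dict(), self.save_path)	# weights only
            return True
        elif self.trial_counter + 1 < self.num_trials:
            self.trial_counter += 1
            return True
        else:
            return False

    def get_best_model(self, model, device):
        # load the best weights into an already-constructed model
        model.load_state_dict(torch.load(self.save_path, map_location=device))
        return model.to(device)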

 

train.py

import argparse

import torch
from torchvision import transforms

from data_setup import create_dataset, create_dataloader
from model_builder import create_model
from engine import train
from utils import EarlyStopper

def main(args):
    transform = transforms.Compose([
        transforms.Resize((64, 64)),
        transforms.ToTensor()
    ])
    
    train_dataset = create_dataset(train=True, transform=transform)
    test_dataset = create_dataset(train=False, transform=transform)
    
    train_dataloader = create_dataloader(dataset=train_dataset, batch_size=args.batch_size)
    test_dataloader = create_dataloader(dataset=test_dataset, batch_size=args.batch_size)
    
    feature, _ = train_dataset[0]
    
    vgg_model = create_model(color_size=feature.shape[0], target_size=len(train_dataset.classes), is_trained=False)
    
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    vgg_model.to(device)
    
    loss_fn = torch.nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(
        params=vgg_model.parameters(),
        lr=args.lr
    )
    
    early_stopper = EarlyStopper(num_trials=5, save_path=args.trained_model)
    
    train(
        model=vgg_model,
        train_dataloader=train_dataloader,
        test_dataloader=test_dataloader,
        optimizer=optimizer,
        loss_fn=loss_fn,
        epochs=args.epochs,
        device=device,
        early_stopper=early_stopper
    )

if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--batch_size', default=32, type=int)
    parser.add_argument('--lr', default=0.01, type=float)
    parser.add_argument('--epochs', default=50, type=int)
    parser.add_argument('--trained_model', default='./models/trained_model.pth')
    
    args = parser.parse_args()
    
    main(args)

 

Run

python .\service\train.py
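
Because sys.path[0] is the script's own directory, the plain `from utils import ...` imports inside service/ resolve when the script is launched this way. The argparse defaults can also be overridden from the shell, e.g.:

python .\service\train.py --batch_size 64 --lr 0.001 --epochs 30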

 

 

 

2. TensorBoard

 

 

TensorBoard

%matplotlib inline

import torch
from torch.utils.tensorboard import SummaryWriter
writer = SummaryWriter()		# creates a 'runs' directory and writes log data there

x = torch.arange(-5, 5, 0.1).view(-1, 1)
y = -5 * x + 0.1 * torch.randn(x.size())

model = torch.nn.Linear(1, 1)
criterion = torch.nn.MSELoss()
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)

def train_model(iter):
    for epoch in range(iter):
        y1 = model(x)
        loss = criterion(y1, y)
        
        writer.add_scalar('Loss/sample', loss, epoch)
        
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

train_model(10)

writer.flush()				# make sure pending log data is written to disk

writer.close()				# close the writer

%load_ext tensorboard
%tensorboard --logdir=runs
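
The same writer works with the modular training loop from section 1. A minimal sketch (my own wiring, not part of the lesson) that logs the loss curves from the results dict returned by engine.train:

from torch.utils.tensorboard import SummaryWriter

writer = SummaryWriter(log_dir='runs/tiny_vgg')		# hypothetical run name
for epoch, (tr, te) in enumerate(zip(results['train_loss'], results['test_loss'])):
    writer.add_scalars('Loss', {'train': tr, 'test': te}, epoch)
writer.flush()
writer.close()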

 

3. HPO Tuning

 

 

import torch
import matplotlib.pyplot as plt

 

 

Learning Rate Scheduler

 

StepLR

  • A scheduler that decays the learning rate at fixed step intervals
    • Every step_size steps, the current learning rate is multiplied by gamma: lr_e = lr_0 * gamma^floor(e / step_size)
model = torch.nn.Linear(2, 1)
optimizer = torch.optim.SGD(model.parameters(), lr=100)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=2, gamma=0.1)

lrs = []

for i in range(10):
    optimizer.step()					# update the model parameters
    lrs.append(optimizer.param_groups[0]['lr'])
    scheduler.step()					# update the learning rate

plt.plot(range(10), lrs)
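
With lr=100, step_size=2, and gamma=0.1, the recorded sequence is 100, 100, 10, 10, 1, 1, 0.1, 0.1, 0.01, 0.01, so the plot shows a staircase that drops by a factor of 10 every two steps.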

 

CyclicLR - triangular2

model = torch.nn.Linear(2, 1)
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
scheduler = torch.optim.lr_scheduler.CyclicLR(
    optimizer,
    base_lr=0.001,
    max_lr=0.1,
    step_size_up=5,
    mode='triangular2'
)

lrs = []

for i in range(100):
    optimizer.step()
    lrs.append(optimizer.param_groups[0]['lr'])
    scheduler.step()

plt.plot(lrs)
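
In 'triangular2' mode the cycle amplitude is halved after each full cycle, so the peaks decay from max_lr=0.1 toward base_lr=0.001 over the 100 iterations while the lows stay at base_lr.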

 

 

Ray Tune

 

# This follows the PyTorch Ray Tune tutorial; load_data, train_cifar, trial_str_creator,
# num_samples, max_num_epochs, and gpus_per_trial are assumed to be defined elsewhere.
import os
from functools import partial

from ray import tune
from ray.tune.schedulers import ASHAScheduler

def main():
    data_dir = os.path.abspath('./load_data')
    load_data(data_dir)
    
    config = {
        'l1': tune.choice([2**i for i in range(9)]),
        'l2': tune.choice([2**i for i in range(9)]),
        'lr': tune.loguniform(1e-4, 1e-1),
        'batch_size': tune.choice([2, 4, 8, 16])
    }
    
    scheduler = ASHAScheduler(
        metric='loss',
        mode='min',				# min for loss / max for accuracy
        max_t=max_num_epochs,
        grace_period=1,
        reduction_factor=2
    )
    
    reporter = tune.JupyterNotebookReporter(
        metric_columns=['loss', 'accuracy', 'training_iteration'])
    
    result = tune.run(
        partial(train_cifar, data_dir=data_dir),
        resources_per_trial={'cpu': 2, 'gpu': gpus_per_trial},
        config=config,
        num_samples=num_samples,
        scheduler=scheduler,
        progress_reporter=reporter,		# hook the reporter defined above into the run
        trial_dirname_creator=trial_str_creator
    )
    
    return result
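
tune.run returns an ExperimentAnalysis object, so the best trial can be read back after the search finishes. A short sketch, assuming main() ran as above:

result = main()

best_trial = result.get_best_trial('loss', 'min', 'last')
print('Best config:', best_trial.config)
print('Best loss:', best_trial.last_result['loss'])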