Day 31 Retrospective.
While working on the unit project, I ran my own data preprocessing and used an XGBoost model, but the test score still didn't even reach 80. The other team members said their best results were also only in the low 80s, so we decided to spend more time on preprocessing through Wednesday. I'm hoping to get a job in data analysis, but it feels so difficult, and I don't seem to understand it properly, which is daunting.
1. Deep Learning - Image Preprocessing
Torchvision
Classic Image Processing Algorithms
- Morphological Image Processing
  - Dilation: adds pixels to the boundaries of objects in an image
  - Erosion: removes pixels from the boundaries of objects in an image
- Gaussian Image Processing
  - Blurs an image with a Gaussian function
- Edge Detection in Image Processing
  - Finds the boundaries of objects within an image
  - Detects discontinuities in brightness (see the sketch after this list)
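These three classic operations can be sketched outside torchvision as well; a minimal example with scipy.ndimage (an assumption on my part, since the lesson did not use scipy):

import numpy as np
from PIL import Image
from scipy import ndimage

gray = np.array(Image.open('cat.jpg').convert('L'), dtype=float)

# Morphological dilation/erosion on a thresholded (binary) version of the image
binary = gray > 128
dilated = ndimage.binary_dilation(binary, iterations=3)  # adds pixels at object boundaries
eroded = ndimage.binary_erosion(binary, iterations=3)    # removes pixels at object boundaries

# Gaussian smoothing: blur strength controlled by sigma
blurred = ndimage.gaussian_filter(gray, sigma=3)

# Edge detection: Sobel gradient magnitude highlights brightness discontinuities
sx = ndimage.sobel(gray, axis=0)
sy = ndimage.sobel(gray, axis=1)
edges = np.hypot(sx, sy)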
Image Processing Using Neural Networks

- Convolutional Neural Network (CNN)
  - Convolutional Layer
  - Pooling Layer
  - Fully Connected Layer
- Transforms are applied when an image is loaded.
from PIL import Image
import numpy as np
import torch
from torchvision import transforms as T
ori_img = Image.open('cat.jpg')
ori_img
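Individual transforms are shown one by one below; they can also be chained with T.Compose, and the whole pipeline runs each time an image is passed through it (the sizes here are arbitrary):

pipeline = T.Compose([
    T.Resize((256, 256)),
    T.Grayscale(),
    T.ToTensor()  # PIL image -> float tensor in [0, 1], shape (C, H, W)
])
pipeline(ori_img).shape
# torch.Size([1, 256, 256])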

import matplotlib.pyplot as plt  # needed by plot() below

def plot(ori_img, imgs:list, with_ori:bool=True, row_title:str=None, **imshow_kwargs):
    if not isinstance(imgs[0], list):  # isinstance(A, B): True if A is an instance of B
        imgs = [imgs]
    num_rows = len(imgs)
    num_cols = len(imgs[0]) + with_ori
    fig, axs = plt.subplots(
        nrows=num_rows,
        ncols=num_cols,
        squeeze=False,
        figsize=(10, 15)
    )
    for row_idx, row in enumerate(imgs):
        row = [ori_img] + row if with_ori else row
        for col_idx, img in enumerate(row):
            ax = axs[row_idx, col_idx]
            ax.imshow(np.asarray(img), **imshow_kwargs)
            ax.set(xticklabels=[], yticklabels=[], xticks=[], yticks=[])
    if with_ori:
        axs[0, 0].set(title='Original Image')
        axs[0, 0].title.set_size(8)
    if row_title is not None:
        for row_idx in range(num_rows):
            axs[row_idx, 0].set(ylabel=row_title[row_idx])
    plt.tight_layout()
- Padding
T.Pad(padding=50)(ori_img)

padded_imgs = [T.Pad(padding=padding)(ori_img) for padding in (3, 10, 30, 50)]
plot(ori_img, padded_imgs)

- Resize
ori_img.size
# (1898, 1266)
resized_imgs = [T.Resize(size=size)(ori_img) for size in (30, 50, 100, ori_img.size[0])]
plot(ori_img, resized_imgs)

resized_imgs = [T.Resize(size=size)(ori_img) for size in ((30, 60), (50, 100), (100, 200))]
plot(ori_img, resized_imgs)

- CenterCrop
center_crops = [T.CenterCrop(size=size)(ori_img) for size in (100, 150, 200, ori_img.size[0])]
plot(ori_img, center_crops)

center_crops = [T.CenterCrop(size=size)(ori_img) for size in ((100, 150), (150, 200), (200, 250))]
plot(ori_img, center_crops)

- FiveCrop
(top_left, top_right, bottom_left, bottom_right, center) = T.FiveCrop(size=(500, 500))(ori_img)
plot(ori_img, [top_left, top_right, bottom_left, bottom_right, center])

- Grayscale
gray_img = T.Grayscale()(ori_img)
plot(ori_img, [gray_img], cmap='gray')

- ColorJitter
jitter = T.ColorJitter(brightness=.5, hue=.3)
jitted_imgs = [jitter(ori_img) for _ in range(4)]
plot(ori_img, jitted_imgs)

- GaussianBlur
blurrer = T.GaussianBlur(kernel_size=(5, 9), sigma=(0.1, 5))
blurred_imgs = [blurrer(ori_img) for _ in range(4)]
plot(ori_img, blurred_imgs)

- RandomPerspective
perspective_transformer = T.RandomPerspective(distortion_scale=0.5, p=1.0)
perspective_imgs = [perspective_transformer(ori_img) for _ in range(4)]
plot(ori_img, perspective_imgs)

- RandomRotation
rotater = T.RandomRotation(degrees=(0, 180))
rotated_imgs = [rotater(ori_img) for _ in range(4)]
plot(ori_img, rotated_imgs)

- RandomAffine
affine_transformer = T.RandomAffine(degrees=(30, 70), translate=(0.1, 0.3), scale=(0.5, 0.75))
affine_imgs = [affine_transformer(ori_img) for _ in range(4)]
plot(ori_img, affine_imgs)

PIL
Image Class
from PIL import Image
img = Image.open('deer.jpg')
img

img.mode
# 'RGB'
logo = Image.open('logo_pillow.png')
logo

logo.mode
# 'RGBA'
logo.size
# (216, 73)
import numpy as np
np.array(logo).shape
# (73, 216, 4)
Image Manipulation
- crop
box = (100, 150, 300, 300)
cropped_image = img.crop(box)
cropped_image

- rotate
rotated_image = img.rotate(180)
rotated_image

- merge - copy & paste
img.mode, logo.mode
# 'RGB', 'RGBA'
np.array(img).shape, np.array(logo).shape
# ((471, 589, 3), (73, 216, 4))
np.array(img).shape, np.array(logo)[:, :, :3].shape
# ((471, 589, 3), (73, 216, 3))
tmp_img = img.copy()
position = (40, 350)
tmp_img.paste(logo, position)
tmp_img
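As called above, paste ignores the logo's alpha channel, so transparent areas come through as solid pixels. Passing the RGBA image again as the third (mask) argument preserves the transparency; this is standard Pillow behavior, though it was not covered in class:

tmp_img = img.copy()
tmp_img.paste(logo, position, logo)  # third argument: mask taken from the logo's alpha channel
tmp_img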

- convert
img.convert('L')

Image Enhancement
from PIL import ImageEnhance
- Sharpness
enhancer = ImageEnhance.Sharpness(img)
enhancer.enhance(10.0)

- Contrast
enhancer = ImageEnhance.Contrast(img)
enhancer.enhance(2)

- ImageFilter
from PIL import ImageFilter
img.filter(ImageFilter.BLUR)

img.filter(ImageFilter.CONTOUR)

img.filter(ImageFilter.BLUR).filter(ImageFilter.FIND_EDGES)

Save Image
- The image can be physically saved to disk.
save_img = img.filter(ImageFilter.BLUR).filter(ImageFilter.FIND_EDGES)
save_img.save('save_img.jpg')
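save also accepts format-specific options; for JPEG, quality is the common one (95 here is just an illustrative value; Pillow recommends staying at or below 95):

save_img.save('save_img_hq.jpg', quality=95)  # higher quality -> less compression loss, larger file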
2. Deep Learning - Vision
CNN
Import Module
import torch
from torch import nn
from torch.utils.data import DataLoader
import torchvision
from torchvision import datasets, transforms
import matplotlib.pyplot as plt
from tqdm.auto import tqdm
!pip install torchinfo
import torchinfo
import os
import random
import numpy as np
import pandas as pd
SEED = 42
def reset_seeds(seed=42):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True

reset_seeds(SEED)
def accuracy_fn(y_true, y_pred):
    correct = torch.eq(y_true, y_pred).sum().item()
    accuracy = (correct / len(y_pred)) * 100
    return accuracy
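A quick check with hypothetical tensors: three of four predictions match, so the function returns 75.0.

y_true = torch.tensor([0, 1, 2, 1])
y_pred = torch.tensor([0, 1, 2, 0])
accuracy_fn(y_true, y_pred)
# 75.0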
def eval_model(model:nn.Module, device:str, dataloader:DataLoader,
               loss_fn:nn.Module, accuracy_fn) -> dict:
    model.to(device)
    model.eval()
    loss, accuracy = 0, 0
    with torch.inference_mode():
        for feature, target in tqdm(dataloader, desc='Eval Step'):
            feature, target = feature.to(device), target.to(device)
            pred = model(feature)
            loss += loss_fn(pred, target)
            accuracy += accuracy_fn(y_true=target, y_pred=pred.argmax(dim=1))
    loss /= len(dataloader)
    accuracy /= len(dataloader)
    return {
        'model': model.__class__.__name__,
        'loss': loss.item(),
        'accuracy': accuracy
    }
def train_step(model:nn.Module, device:str, dataloader:DataLoader,
               loss_fn:nn.Module, accuracy_fn, optimizer:torch.optim.Optimizer):
    model.to(device)
    model.train()
    train_loss, train_accuracy = 0, 0
    for feature, target in tqdm(dataloader, desc='Train Step', leave=False):
        feature, target = feature.to(device), target.to(device)
        pred = model(feature)
        loss = loss_fn(pred, target)
        train_loss += loss.detach()  # accumulate a detached copy so each batch's graph can be freed
        train_accuracy += accuracy_fn(y_true=target, y_pred=pred.argmax(dim=1))
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    train_loss /= len(dataloader)
    train_accuracy /= len(dataloader)
    return train_loss, train_accuracy
def test_step(model:nn.Module, device:str, dataloader:DataLoader, loss_fn:nn.Module, accuracy_fn):
    model.to(device)
    model.eval()
    loss, accuracy = 0, 0
    with torch.inference_mode():
        for feature, target in tqdm(dataloader, desc='Test Step', leave=False):
            feature, target = feature.to(device), target.to(device)
            pred = model(feature)
            loss += loss_fn(pred, target)
            accuracy += accuracy_fn(y_true=target, y_pred=pred.argmax(dim=1))
    loss /= len(dataloader)
    accuracy /= len(dataloader)
    return loss, accuracy
Data
import easydict
args = easydict.EasyDict()
from google.colab import drive
drive.mount('/content/data')
args.data_path = ''
args.device = 'cuda' if torch.cuda.is_available() else 'cpu'
from pathlib import Path
data_path = Path(args.data_path)
train_path = data_path / 'train'
test_path = data_path / 'test'
train_imgs = list(train_path.glob('*/*.jpg'))
from torchvision import transforms as T
simple_transform = T.Compose([
    T.Resize((64, 64)),
    T.RandomPerspective(distortion_scale=0.6, p=1.0),
    T.RandomRotation(degrees=(0, 180)),
    T.ToTensor()
])
train_dataset = datasets.ImageFolder(
    root=train_path,
    transform=simple_transform
)
test_dataset = datasets.ImageFolder(
    root=test_path,
    transform=simple_transform
)
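A quick sanity check on a sample (assuming the train folder holds class subdirectories of RGB .jpg files): ToTensor yields a float tensor of shape (3, 64, 64) with values in [0, 1], plus an integer class label.

sample_img, sample_label = train_dataset[0]
sample_img.shape, sample_label
# (torch.Size([3, 64, 64]), 0)  e.g., for the first class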
args.batch_size = 32
train_dataloader = DataLoader(
    train_dataset,
    batch_size=args.batch_size,
    shuffle=True
)
test_dataloader = DataLoader(
    test_dataset,
    batch_size=args.batch_size,
    shuffle=True
)
CNN Model

class CNNModel(nn.Module):
    def __init__(self, input_shape, target_shape, hidden_units=32) -> None:
        super().__init__()
        self.block_1 = nn.Sequential(
            nn.Conv2d(in_channels=input_shape,
                      out_channels=hidden_units,
                      kernel_size=3,
                      stride=1,
                      padding=1),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2)
        )
        self.block_2 = nn.Sequential(
            nn.Conv2d(in_channels=hidden_units,
                      out_channels=hidden_units*2,
                      kernel_size=3,
                      stride=1,
                      padding=1),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2)
        )
        self.classifier = nn.Sequential(
            nn.Flatten(),
            nn.Linear(in_features=hidden_units*2*16*16,
                      out_features=hidden_units*2*16*16),
            nn.ReLU(),
            nn.Linear(in_features=hidden_units*2*16*16,
                      out_features=target_shape)
        )

    def forward(self, x):             # x   -> (batch(=32), color(=3), row(=64), column(=64))
        out = self.block_1(x)         # out -> (batch(=32), feature_map(=32), row(=32), column(=32))
        out = self.block_2(out)       # out -> (batch(=32), feature_map(=64), row(=16), column(=16))
        return self.classifier(out)   # out -> (batch(=32), target_shape(=3))
cnn_model = CNNModel(input_shape=3, target_shape=3).to(args.device)
input_size = (32, 3, 64, 64)
torchinfo.summary(cnn_model, input_size, col_names=['kernel_size', 'input_size', 'output_size', 'num_params'])
# ============================================================================================================================================
# Layer (type:depth-idx) Kernel Shape Input Shape Output Shape Param #
# ============================================================================================================================================
# CNNModel -- [32, 3, 64, 64] [32, 3] --
# ├─Sequential: 1-1 -- [32, 3, 64, 64] [32, 32, 32, 32] --
# │ └─Conv2d: 2-1 [3, 3] [32, 3, 64, 64] [32, 32, 64, 64] 896
# │ └─ReLU: 2-2 -- [32, 32, 64, 64] [32, 32, 64, 64] --
# │ └─MaxPool2d: 2-3 2 [32, 32, 64, 64] [32, 32, 32, 32] --
# ├─Sequential: 1-2 -- [32, 32, 32, 32] [32, 64, 16, 16] --
# │ └─Conv2d: 2-4 [3, 3] [32, 32, 32, 32] [32, 64, 32, 32] 18,496
# │ └─ReLU: 2-5 -- [32, 64, 32, 32] [32, 64, 32, 32] --
# │ └─MaxPool2d: 2-6 2 [32, 64, 32, 32] [32, 64, 16, 16] --
# ├─Sequential: 1-3 -- [32, 64, 16, 16] [32, 3] --
# │ └─Flatten: 2-7 -- [32, 64, 16, 16] [32, 16384] --
# │ └─Linear: 2-8 -- [32, 16384] [32, 16384] 268,451,840
# │ └─ReLU: 2-9 -- [32, 16384] [32, 16384] --
# │ └─Linear: 2-10 -- [32, 16384] [32, 3] 49,155
# ============================================================================================================================================
# Total params: 268,520,387
# Trainable params: 268,520,387
# Non-trainable params: 0
# Total mult-adds (Units.GIGABYTES): 9.32
# ============================================================================================================================================
# Input size (MB): 1.57
# Forward/backward pass size (MB): 54.53
# Params size (MB): 1074.08
# Estimated Total Size (MB): 1130.18
# ============================================================================================================================================
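Beyond the summary, a dummy forward pass (an extra sanity check, not in the original notes) confirms the output shape:

dummy_batch = torch.randn(32, 3, 64, 64).to(args.device)
cnn_model(dummy_batch).shape
# torch.Size([32, 3])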
Training
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(
    params=cnn_model.parameters(),
    lr=0.1
)
epochs = 10
train_loss_values = []
test_loss_values = []
for epoch in tqdm(range(epochs), desc='Epoch'):
    train_loss, train_acc = train_step(
        model=cnn_model,
        device=args.device,
        dataloader=train_dataloader,
        loss_fn=loss_fn,
        accuracy_fn=accuracy_fn,
        optimizer=optimizer
    )
    train_loss_values.append(train_loss.cpu().numpy())  # move to CPU before numpy (needed on CUDA)
    test_loss, test_acc = test_step(
        model=cnn_model,
        device=args.device,
        dataloader=test_dataloader,
        loss_fn=loss_fn,
        accuracy_fn=accuracy_fn
    )
    test_loss_values.append(test_loss.cpu().numpy())
plt.plot(train_loss_values, label='Train Loss')
plt.plot(test_loss_values, label='Test Loss')
plt.title('Training and Test Loss Curves')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()
plt.show()
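The eval_model helper defined earlier can then summarize the final test performance in one call (the values below are placeholders):

eval_model(
    model=cnn_model,
    device=args.device,
    dataloader=test_dataloader,
    loss_fn=loss_fn,
    accuracy_fn=accuracy_fn
)
# {'model': 'CNNModel', 'loss': ..., 'accuracy': ...}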
