
Day 27. PyTorch - Dataset


Day 27 Retrospective.

 

We seemed to cover less material today, but that much more of it was important. It was an especially tiring day, though, so the class was hard to get through.


1. PyTorch

 

 

1-1. Load and Normalize Datasets

 

Dataset and DataLoader

# Data analysis module
import numpy as np

# Deep learning module
import torch
# Data handling module
from torch.utils.data import Dataset
# Image dataset module
from torchvision import datasets
# Data transform module
from torchvision.transforms import ToTensor, Lambda

# Visualization module
import matplotlib.pyplot as plt
%matplotlib inline

 

Torch Dataset

# FashionMNIST
training_data = datasets.FashionMNIST(
    root='fashion_data',		# folder to save the dataset in
    train=True,				# training split
    download=True			# skipped if the dataset has already been downloaded
)

test_data = datasets.FashionMNIST(
    root='fashion_data',
    train=False,
    download=True,
    transform=ToTensor()
)

type(training_data), type(test_data)
# (torchvision.datasets.mnist.FashionMNIST,
#  torchvision.datasets.mnist.FashionMNIST)

len(training_data), len(test_data)
# (60000, 10000)

features, target = training_data[0]
np.array(features).shape, target
# ((28, 28), 9)
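Since training_data was built without a transform, indexing it returns a (PIL image, label) pair, which is why the sample is wrapped in np.array above. test_data, built with transform=ToTensor(), returns a tensor directly:

features, target = test_data[0]

type(features), features.shape
# (torch.Tensor, torch.Size([1, 28, 28]))	# ToTensor adds the channel dim and scales pixels to [0, 1]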
# Iterate and Visualize the Dataset
labels_map = {
    0: 'T-shirt/top',
    1: 'Trouser',
    2: 'Pullover',
    3: 'Dress',
    4: 'Coat',
    5: 'Sandal',
    6: 'Shirt',
    7: 'Sneaker',
    8: 'Bag',
    9: 'Ankle boot'
}

figure = plt.figure(figsize=(8, 8))
cols, rows = 3, 3

for i in range(1, cols * rows + 1):
    sample_idx = torch.randint(len(training_data), size=(1,)).item()
    
    img, label = training_data[sample_idx]
    
    figure.add_subplot(rows, cols, i)
    
    plt.title(labels_map[label])
    plt.axis("off")
    plt.imshow(img, cmap="gray")

plt.show()

# Prepare Data for Training with DataLoader
from torch.utils.data import DataLoader

# Every dataset except time-series data should be loaded with 'shuffle=True'
test_dataloader = DataLoader(test_data, batch_size=64, shuffle=True)

len(test_data), len(test_dataloader)
# (10000, 157)
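len of a DataLoader counts batches, not samples: ceil(10000 / 64) = 157, with the final batch holding only 10000 - 156 * 64 = 16 samples (it is kept because drop_last defaults to False).

import math

math.ceil(len(test_data) / 64)
# 157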
# Iterate through the DataLoader
type(next(iter(test_dataloader))), len(next(iter(test_dataloader)))
# (list, 2)

test_features, test_labels = next(iter(test_dataloader))

type(test_features), type(test_labels)
# (torch.Tensor, torch.Tensor)

test_features.shape
# torch.Size([64, 1, 28, 28])			# (batch, color, row, column)
						# color: 1 (grayscale) / 3 (RGB)

test_labels.shape
# torch.Size([64])

img = test_features[0].squeeze()
label = test_labels[0]

plt.axis("off")
plt.imshow(img, cmap="gray")

plt.show()
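In real training code the DataLoader is usually consumed in a loop (one full pass = one epoch) rather than with next(iter(...)); a minimal sketch:

for batch, (features, labels) in enumerate(test_dataloader):
    if batch == 0:
        print(features.shape, labels.shape)
        # torch.Size([64, 1, 28, 28]) torch.Size([64])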

 

ImageFolder Dataset

To use ImageFolder, each folder name must act as a label, and every image must sit inside its label's folder.
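For the pizza_steak_sushi data used below, that layout looks like this (matching the walk_through_dir output further down):

pizza_steak_sushi/
├── train/
│   ├── pizza/		# *.jpg files
│   ├── steak/
│   └── sushi/
└── test/
    ├── pizza/
    ├── steak/
    └── sushi/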

# Google Drive
from google.colab import drive

drive.mount('/content/data')

DATA_PATH = ""
# Get Data
import os
from pathlib import Path

data_path = Path(DATA_PATH)
image_path = data_path / "pizza_steak_sushi"

def walk_through_dir(dir_path):
    for dirpath, dirnames, filenames in os.walk(dir_path):
        print(f"There are {len(dirnames)} directories and {len(filenames)} images in {dirpath}")

walk_through_dir(image_path)
# There are 2 directories and 0 images in /content/data/MyDrive/SKN/AI/3. Deep Learning/1. Introduction Pytorch/data/pizza_steak_sushi
# There are 3 directories and 0 images in /content/data/MyDrive/SKN/AI/3. Deep Learning/1. Introduction Pytorch/data/pizza_steak_sushi/train
# There are 0 directories and 78 images in /content/data/MyDrive/SKN/AI/3. Deep Learning/1. Introduction Pytorch/data/pizza_steak_sushi/train/pizza
# ...
# There are 0 directories and 25 images in /content/data/MyDrive/SKN/AI/3. Deep Learning/1. Introduction Pytorch/data/pizza_steak_sushi/test/pizza
# There are 0 directories and 31 images in /content/data/MyDrive/SKN/AI/3. Deep Learning/1. Introduction Pytorch/data/pizza_steak_sushi/test/sushi
# There are 0 directories and 19 images in /content/data/MyDrive/SKN/AI/3. Deep Learning/1. Introduction Pytorch/data/pizza_steak_sushi/test/steak

list(image_path.glob("*")
# [PosixPath('/content/data/MyDrive/SKN/AI/3. Deep Learning/1. Introduction Pytorch/data/pizza_steak_sushi/train'),
#  PosixPath('/content/data/MyDrive/SKN/AI/3. Deep Learning/1. Introduction Pytorch/data/pizza_steak_sushi/test')]

train_path = image_path / "train"
test_path = image_path / "test"
# Visualize an Image
import random
from PIL import Image

train_image_path_list = list(train_path.glob('*/*.jpg'))

random_sample_image_path = random.choice(train_image_path_list)	# choice: samples without replacement / choices: samples with replacement

label = random_sample_image_path.parent.stem
img = Image.open(random_sample_image_path)

label, img.height, img.width
# ('steak', 342, 512)

img

import numpy as np
import matplotlib.pyplot as plt

arr_img = np.asarray(img)

plt.title(label)
plt.imshow(arr_img)
plt.axis(False)

plt.show()

# ImageFolder
from torchvision import datasets

train_dataset = datasets.ImageFolder(
    root=train_path,
    transform=ToTensor()
)

test_dataset = datasets.ImageFolder(
    root=test_path,
    transform=ToTensor()
)

len(train_dataset), len(test_dataset)
# (225, 75)

train_dataset
# Dataset ImageFolder
#     Number of datapoints: 225
#     Root location: /content/data/MyDrive/SKN/AI/3. Deep Learning/1. Introduction Pytorch/data/pizza_steak_sushi/train
#     StandardTransform
# Transform: ToTensor()

class_names = train_dataset.classes
class_names
# ['pizza', 'steak', 'sushi']

class_dict = train_dataset.class_to_idx
class_dict
# {'pizza': 0, 'steak': 1, 'sushi': 2}

class_name = 'steak'
class_dict[class_name]
# 1

class_names[class_dict[class_name]]
# 'steak'
img, label = train_dataset[0]

img.shape, label
# (torch.Size([3, 512, 512]), 0)			# img: torch.Size([color, row, column])

img_permute = img.permute(1, 2, 0)		# (color, row, column) -> (row, column, color), the order matplotlib expects

plt.title(class_names[label])
plt.imshow(img_permute)
plt.axis(False)

plt.show()

# DataLoader
from torch.utils.data import DataLoader

dl_train = DataLoader(
    dataset=train_dataset,
    batch_size=1,
    shuffle=True
)

len(train_dataset), len(dl_train)
# (225, 225)

imgs, labels = next(iter(dl_train))

imgs.shape, labels.shape
# (torch.Size([1, 3, 512, 512]), torch.Size([1]))	# imgs: torch.Size([batch, color, row, column])
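batch_size=1 also sidesteps the fact that these images come in different sizes (342x512 earlier, 512x512 here), which the default collate cannot stack into a single tensor. To batch more than one image, resize inside the transform first; a sketch using torchvision's Resize (the 224x224 size is an arbitrary choice, not from the original notebook):

from torchvision import transforms

resize_transform = transforms.Compose([
    transforms.Resize((224, 224)),	# force every image to the same size
    transforms.ToTensor()
])

train_dataset_resized = datasets.ImageFolder(root=train_path, transform=resize_transform)
dl_train_32 = DataLoader(train_dataset_resized, batch_size=32, shuffle=True)

imgs, labels = next(iter(dl_train_32))
imgs.shape, labels.shape
# (torch.Size([32, 3, 224, 224]), torch.Size([32]))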

 

Dataset Class

# Create a Dataset
import os

from torch.utils.data import Dataset
from pathlib import Path
from PIL import Image

class CustomDataset(Dataset):			# inherits from Dataset
    # __init__: declare the attributes the other methods will use
    def __init__(self, data_path: str, transform=None):
        self.paths = list(Path(data_path).glob('*/*.jpg'))
        
        self.index_to_class = self.__get_classes(data_path)
        self.class_to_index = { class_name: i for i, class_name in enumerate(self.index_to_class) }
        
        self.transform = transform
    
    # private method - used only inside the class
    def __get_classes(self, data_path):
        return sorted(entry.name for entry in os.scandir(data_path) if entry.is_dir())
    
    # magic method - len: number of samples in the dataset
    def __len__(self) -> int:
        return len(self.paths)
    
    # magic method - getitem: return the (feature, target) pair at the given index
    def __getitem__(self, index: int):
        feature_path = self.paths[index]
        img = Image.open(feature_path)
        
        target_name = feature_path.parent.stem
        
        if self.transform:
            img = self.transform(img)
        
        return img, self.class_to_index[target_name]

train_path = ("")
train_custom_dataset = CustomDataset(data_path=train_path, transform=ToTensor())

len(train_custom_dataset)
# 225

feature, target = train_custom_dataset[0]
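The custom dataset plugs into DataLoader exactly like the ImageFolder version did; a quick check (image sizes vary, so the shape below is just one example):

dl_custom = DataLoader(dataset=train_custom_dataset, batch_size=1, shuffle=True)

imgs, labels = next(iter(dl_custom))
imgs.shape, labels.shape
# e.g. (torch.Size([1, 3, 512, 512]), torch.Size([1]))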

 

Titanic Dataset Class

# Load Data
import pandas as pd
import seaborn as sns

df_titanic = sns.load_dataset('titanic')

df_titanic.shape
# (891, 15)

df_titanic.columns
# Index(['survived', 'pclass', 'sex', 'age', 'sibsp', 'parch', 'fare',
#        'embarked', 'class', 'who', 'adult_male', 'deck', 'embark_town',
#        'alive', 'alone'],
#       dtype='object')
# Dataset Class
class CustomDataset(Dataset):
    def __init__(self, df: pd.DataFrame, target: str, transform=None) -> None:
        self.features = df.drop(columns=target)
        self.target = df[target]
        self.transform = transform
    
    def __len__(self) -> int:
        return self.features.shape[0]
    
    def __getitem__(self, index):
        feature = self.features.iloc[index]
        target = self.target.iloc[index]
        
        if self.transform:
            feature = self.transform(feature)
        
        return feature, target

titanic_custom_dataset = CustomDataset(df=df_titanic, target='survived')

len(titanic_custom_dataset)
# 891

feature, target = titanic_custom_dataset[0]
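__getitem__ here returns a raw pandas Series that mixes numeric and non-numeric columns ('sex', 'embarked', ...), so the default DataLoader collate cannot batch it as-is. A minimal sketch that restricts the frame to numeric columns and converts each row to a tensor (the numeric_cols selection is an illustrative assumption, not from the original notebook):

numeric_cols = ['pclass', 'age', 'sibsp', 'parch', 'fare', 'survived']
df_numeric = df_titanic[numeric_cols].dropna()		# drop rows with missing 'age'

to_tensor = lambda row: torch.tensor(row.to_numpy(), dtype=torch.float32)
titanic_tensor_dataset = CustomDataset(df=df_numeric, target='survived', transform=to_tensor)

dl_titanic = DataLoader(titanic_tensor_dataset, batch_size=32, shuffle=True)
features, targets = next(iter(dl_titanic))

features.shape, targets.shape
# (torch.Size([32, 5]), torch.Size([32]))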