Day 27 Retrospective.
The amount covered today felt small, but much of it was important. Unfortunately, I was especially tired today, so the class was quite hard to get through.
1. PyTorch
1-1. Load and Normalize Datasets
Dataset and DataLoader
# Data analysis module
import numpy as np
# Deep learning module
import torch
# Data handling module
from torch.utils.data import Dataset
# Image dataset module
from torchvision import datasets
# Data transform module
from torchvision.transforms import ToTensor, Lambda
# Visualization module
import matplotlib.pyplot as plt
%matplotlib inline
Torch Dataset
# FashionMNIST
training_data = datasets.FashionMNIST(
    root='fashion_data',  # folder the dataset is saved in
    train=True,           # training dataset
    download=True         # skip the download if the dataset is already there
)
test_data = datasets.FashionMNIST(
    root='fashion_data',
    train=False,
    download=True,
    transform=ToTensor()
)
type(training_data), type(test_data)
# (torchvision.datasets.mnist.FashionMNIST,
# torchvision.datasets.mnist.FashionMNIST)
len(training_data), len(test_data)
# (60000, 10000)
features, target = training_data[0]
np.array(features).shape, target
# ((28, 28), 9)
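Since training_data was loaded without a transform, indexing it returns a raw PIL image. As a minimal sketch (not in the original notes), applying ToTensor() to that image adds the channel dimension and scales pixels to [0, 1]:
# ToTensor() turns the 28x28 grayscale PIL image into a [channel, row, column] float tensor
tensor_features = ToTensor()(features)
tensor_features.shape, tensor_features.dtype
# (torch.Size([1, 28, 28]), torch.float32)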
# Iterate and Visualize the Dataset
labels_map = {
    0: 'T-shirt/top',
    1: 'Trouser',
    2: 'Pullover',
    3: 'Dress',
    4: 'Coat',
    5: 'Sandal',
    6: 'Shirt',
    7: 'Sneaker',
    8: 'Bag',
    9: 'Ankle boot'
}
figure = plt.figure(figsize=(8, 8))
cols, rows = 3, 3
for i in range(1, cols * rows + 1):
    sample_idx = torch.randint(len(training_data), size=(1,)).item()
    img, label = training_data[sample_idx]
    figure.add_subplot(rows, cols, i)
    plt.title(labels_map[label])
    plt.axis("off")
    plt.imshow(img, cmap="gray")
plt.show()

# Prepare Data for Training with DataLoader
from torch.utils.data import DataLoader
# Every dataset except time-series data should use 'shuffle=True'
test_dataloader = DataLoader(test_data, batch_size=64, shuffle=True)
len(test_data), len(test_dataloader)
# (10000, 157)
# Iterate through the DataLoader
type(next(iter(test_dataloader))), len(next(iter(test_dataloader)))
# (list, 2)
test_features, test_labels = next(iter(test_dataloader))
type(test_features), type(test_labels)
# (torch.Tensor, torch.Tensor)
test_features.shape
# torch.Size([64, 1, 28, 28]) # (batch, color, row, column)
# color: 1 (grayscale) / 3 (color)
test_labels.shape
# torch.Size([64])
img = test_features[0].squeeze()
label = test_labels[0]
plt.axis("off")
plt.imshow(img, cmap="gray")
plt.show()
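Following the shuffle note above, here is a minimal sketch (variable names are assumptions, not in the original notes) of the matching training DataLoader. training_data was loaded without ToTensor(), so it is reloaded with the transform before batching:
# training_data above holds PIL images, which the default collate cannot stack;
# reload it with ToTensor() before wrapping it in a DataLoader (assumed names)
training_data_tensor = datasets.FashionMNIST(
    root='fashion_data',
    train=True,
    download=True,
    transform=ToTensor()
)
train_dataloader = DataLoader(training_data_tensor, batch_size=64, shuffle=True)
len(train_dataloader)
# 938 (= ceil(60000 / 64))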

ImageFolder Dataset
To use ImageFolder, the folder names must correspond to the class labels, and each image must be stored inside its matching label folder.
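For reference, the pizza_steak_sushi data used below follows exactly that layout (as the walk_through_dir output further down confirms):
pizza_steak_sushi/
├── train/
│   ├── pizza/
│   ├── steak/
│   └── sushi/
└── test/
    ├── pizza/
    ├── steak/
    └── sushi/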
# Google Drive
from google.colab import drive
drive.mount('/content/data')
DATA_PATH = ""
# Get Data
import os
from pathlib import Path
data_path = Path(DATA_PATH)
image_path = data_path / "pizza_steak_sushi"
def walk_through_dir(dir_path):
    for dirpath, dirnames, filenames in os.walk(dir_path):
        print(f"There are {len(dirnames)} directories and {len(filenames)} images in {dirpath}")
walk_through_dir(image_path)
# There are 2 directories and 0 images in /content/data/MyDrive/SKN/AI/3. Deep Learning/1. Introduction Pytorch/data/pizza_steak_sushi
# There are 3 directories and 0 images in /content/data/MyDrive/SKN/AI/3. Deep Learning/1. Introduction Pytorch/data/pizza_steak_sushi/train
# There are 0 directories and 78 images in /content/data/MyDrive/SKN/AI/3. Deep Learning/1. Introduction Pytorch/data/pizza_steak_sushi/train/pizza
# ...
# There are 0 directories and 25 images in /content/data/MyDrive/SKN/AI/3. Deep Learning/1. Introduction Pytorch/data/pizza_steak_sushi/test/pizza
# There are 0 directories and 31 images in /content/data/MyDrive/SKN/AI/3. Deep Learning/1. Introduction Pytorch/data/pizza_steak_sushi/test/sushi
# There are 0 directories and 19 images in /content/data/MyDrive/SKN/AI/3. Deep Learning/1. Introduction Pytorch/data/pizza_steak_sushi/test/steak
list(image_path.glob("*"))
# [PosixPath('/content/data/MyDrive/SKN/AI/3. Deep Learning/1. Introduction Pytorch/data/pizza_steak_sushi/train'),
# PosixPath('/content/data/MyDrive/SKN/AI/3. Deep Learning/1. Introduction Pytorch/data/pizza_steak_sushi/test')]
train_path = image_path / "train"
test_path = image_path / "test"
# Visualize an Image
import random
from PIL import Image
train_image_path_list = list(train_path.glob('*/*.jpg'))
random_sample_image_path = random.choice(train_image_path_list) # choice: sampling without replacement / choices: sampling with replacement
label = random_sample_image_path.parent.stem
img = Image.open(random_sample_image_path)
label, img.height, img.width
# ('steak', 342, 512)
img

import numpy as np
import matplotlib.pyplot as plt
arr_img = np.asarray(img)
plt.title(label)
plt.imshow(arr_img)
plt.axis(False)
plt.show()

# ImageFolder
from torchvision import datasets
train_dataset = datasets.ImageFolder(
    root=train_path,
    transform=ToTensor()
)
test_dataset = datasets.ImageFolder(
    root=test_path,
    transform=ToTensor()
)
len(train_dataset), len(test_dataset)
# (225, 75)
train_dataset
# Dataset ImageFolder
# Number of datapoints: 225
# Root location: /content/data/MyDrive/SKN/AI/3. Deep Learning/1. Introduction Pytorch/data/pizza_steak_sushi/train
# StandardTransform
# Transform: ToTensor()
class_names = train_dataset.classes
class_names
# ['pizza', 'steak', 'sushi']
class_dict = train_dataset.class_to_idx
class_dict
# {'pizza': 0, 'steak': 1, 'sushi': 2}
class_name = 'steak'
class_dict[class_name]
# 1
class_names[class_dict[class_name]]
# 'steak'
img, label = train_dataset[0]
img.shape, label
# (torch.Size([3, 512, 512]), 0) # img: torch.Size([color, row, column])
img_permute = img.permute(1, 2, 0) # (color, row, column) -> (row, column, color) for matplotlib
plt.title(class_names[label])
plt.imshow(img_permute)
plt.axis(False)
plt.show()

# DataLoader
from torch.utils.data import DataLoader
dl_train = DataLoader(
    dataset=train_dataset,
    batch_size=1,
    shuffle=True
)
len(train_dataset), len(dl_train)
# (225, 225)
imgs, labels = next(iter(dl_train))
imgs.shape, labels.shape
# (torch.Size([1, 3, 512, 512]), torch.Size([1])) # imgs: torch.Size([batch, color, row, column])
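batch_size=1 is used above because the raw pizza/steak/sushi images come in different resolutions and cannot be stacked into one batch. A minimal sketch (the 224x224 size and variable names are assumptions, not from the notes) of how a Resize transform enables larger batches:
from torchvision import transforms
# Resize every image to a fixed size so the default collate can stack them
resize_transform = transforms.Compose([
    transforms.Resize((224, 224)),  # assumed target size
    transforms.ToTensor()
])
train_dataset_resized = datasets.ImageFolder(root=train_path, transform=resize_transform)
dl_train_resized = DataLoader(train_dataset_resized, batch_size=32, shuffle=True)
imgs, labels = next(iter(dl_train_resized))
imgs.shape, labels.shape
# (torch.Size([32, 3, 224, 224]), torch.Size([32]))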
Dataset Class
# Create a Dataset
import os
from torch.utils.data import Dataset
from pathlib import Path
from PIL import Image
class CustomDataset(Dataset): # inherit from Dataset
    # __init__: declare the attributes the other methods will use
    def __init__(self, data_path:str, transform=None):
        self.paths = list(Path(data_path).glob('*/*.jpg'))
        self.index_to_class = self.__get_classes(data_path)
        self.class_to_index = { class_name: i for i, class_name in enumerate(self.index_to_class) }
        self.transform = transform

    # private method - used only inside the class
    def __get_classes(self, data_path):
        return sorted(entry.name for entry in os.scandir(data_path) if entry.is_dir())

    # magic method - len: number of samples
    def __len__(self) -> int:
        return len(self.paths)

    # magic method - getitem: return one (feature, target) pair
    def __getitem__(self, index:int):
        feature_path = self.paths[index]
        img = Image.open(feature_path)
        target_name = feature_path.parent.stem
        if self.transform:
            img = self.transform(img)
        return img, self.class_to_index[target_name]
train_path = ""
train_custom_dataset = CustomDataset(data_path=train_path, transform=ToTensor())
len(train_custom_dataset)
# 225
feature, target = train_custom_dataset[0]
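A minimal sketch (variable name assumed, not from the notes) of plugging the custom dataset into a DataLoader; batch_size=1 is kept because, without a resize transform, the images still have different resolutions:
dl_custom_train = DataLoader(train_custom_dataset, batch_size=1, shuffle=True)
imgs, labels = next(iter(dl_custom_train))
imgs.shape, labels.shape
# (torch.Size([1, 3, H, W]), torch.Size([1])) # H, W depend on the sampled image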
Titanic Dataset Class
# Load Data
import seaborn as sns
df_titanic = sns.load_dataset('titanic')
df_titanic.shape
# (891, 15)
df_titanic.columns
# Index(['survived', 'pclass', 'sex', 'age', 'sibsp', 'parch', 'fare',
# 'embarked', 'class', 'who', 'adult_male', 'deck', 'embark_town',
# 'alive', 'alone'],
# dtype='object')
# Dataset Class
import pandas as pd

class CustomDataset(Dataset):
    def __init__(self, df:pd.DataFrame, target:str, transform=None) -> None:
        self.features = df.drop(columns=target)
        self.target = df[target]
        self.transform = transform

    def __len__(self) -> int:
        return self.features.shape[0]

    def __getitem__(self, index):
        feature = self.features.iloc[index]
        target = self.target.iloc[index]
        if self.transform:
            feature = self.transform(feature)
        return feature, target
titanic_custom_dataset = CustomDataset(df=df_titanic, target='survived')
len(titanic_custom_dataset)
feature, target = titanic_custom_dataset[0]
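The returned feature here is a pandas Series with mixed types, so it cannot be batched directly. A minimal sketch (the column subset and function name are assumptions, not from the notes) of a transform that turns the numeric columns into a tensor via the transform hook:
import torch

NUMERIC_COLS = ['pclass', 'age', 'sibsp', 'parch', 'fare']  # assumed feature subset

def numeric_to_tensor(row):
    # row is the pandas Series returned by __getitem__
    return torch.tensor(row[NUMERIC_COLS].astype('float32').values)

titanic_numeric_dataset = CustomDataset(df=df_titanic, target='survived', transform=numeric_to_tensor)
feature, target = titanic_numeric_dataset[0]
feature.shape
# torch.Size([5])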