Day 41 retrospective.
I found out that the 빅데이터분석기사 (Big Data Analysis Engineer certification) exam is less than four weeks away. I've heard it's a tough exam, so I should start studying right away.
1. CNN 1D Model
1-1. CNN 1D Model
DataLoader
from torch.utils.data import DataLoader
dl_train = DataLoader(
dataset=dt_train,
batch_size=256,
shuffle=True
)
dl_test = DataLoader(
dataset=dt_test,
batch_size=256,
shuffle=False
)
len(dl_train)
# 274
features, targets = next(iter(dl_train))
features.shape, targets.shape
# (torch.Size([256, 52]), torch.Size([256, 1]))
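As a quick sanity check on the recorded output, 274 batches of at most 256 samples puts the training split at roughly 70k reviews:

# 273 full batches plus one final (possibly partial) batch:
# 273 * 256 = 69,888 < n_train <= 274 * 256 = 70,144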
CNN 1D Model
import torch
from torch import nn
- Embedding Layer
class EmbeddingLayer(nn.Module):
def __init__(self, vocab_len, embedding=128) -> None:
super().__init__()
self.emb = nn.Embedding(vocab_len, embedding)
def forward(self, x):
return self.emb(x)
# Test
embedding_layer = EmbeddingLayer(len(vocab))
emb_out = embedding_layer(features)
emb_out.shape
# torch.Size([256, 52, 128])
- CNN 1D Layer
class CNN1DLayer(nn.Module):
def __init__(self, embedding) -> None:
super().__init__()
self.cnn_1d = nn.Sequential(
nn.Conv1d(in_channels=embedding, out_channels=embedding*2, kernel_size=3),
nn.ReLU(),
nn.MaxPool1d(2),
nn.Conv1d(in_channels=embedding*2, out_channels=embedding*4, kernel_size=3),
nn.ReLU(),
nn.MaxPool1d(2),
nn.AdaptiveAvgPool1d(1)
)
def forward(self, x):
pre_x = x.permute(0, 2, 1)
out = self.cnn_1d(pre_x)
return out
# Test
cnn1d_layer = CNN1DLayer(128)
cnn_out = cnn1d_layer(emb_out)
cnn_out.shape
# torch.Size([256, 512, 1])
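The sequence-length bookkeeping behind that shape, assuming the padded length of 52 from the batch above: each Conv1d with kernel_size=3 and no padding trims 2 positions, each MaxPool1d(2) halves the length (flooring), and AdaptiveAvgPool1d(1) collapses whatever remains.

# [256, 128, 52] -> Conv1d(k=3) -> 50 -> MaxPool1d(2) -> 25
# -> Conv1d(k=3) -> 23 -> MaxPool1d(2) -> 11
# -> AdaptiveAvgPool1d(1) -> 1, i.e. [256, 512, 1]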
- FC Layer
class FCLayer(nn.Module):
def __init__(self, embedding, target_size=1) -> None:
super().__init__()
self.fc = nn.Sequential(
nn.Flatten(),
nn.Linear(embedding*4, target_size)
)
def forward(self, x):
return self.fc(x)
# Test
fc_layer = FCLayer(128)
fc_out = fc_layer(cnn_out)
fc_out.shape
# torch.Size([256, 1])
- CNN 1D Model
class CNN1DModel(nn.Module):
def __init__(self, vocab_len, embedding=128, target_size=1) -> None:
super().__init__()
self.emb = EmbeddingLayer(vocab_len=vocab_len, embedding=embedding)
self.cnn = CNN1DLayer(embedding=embedding)
self.fc = FCLayer(embedding=embedding, target_size=target_size)
def forward(self, x):
emb_out = self.emb(x)
cnn_out = self.cnn(emb_out)
fc_out = self.fc(cnn_out)
return fc_out
device = 'cuda' if torch.cuda.is_available() else 'cpu'
device
# 'cpu'
cnn_model = CNN1DModel(vocab_len=len(vocab)).to(device)
!pip install torchinfo
import torchinfo
torchinfo.summary(
model=cnn_model,
input_size=(10, 41),
dtypes=[torch.long],
col_names=['kernel_size', 'input_size', 'output_size', 'num_params']
)
"""
============================================================================================================================================
Layer (type:depth-idx) Kernel Shape Input Shape Output Shape Param #
============================================================================================================================================
CNN1DModel -- [10, 41] [10, 1] --
├─EmbeddingLayer: 1-1 -- [10, 41] [10, 41, 128] --
│ └─Embedding: 2-1 -- [10, 41] [10, 41, 128] 3,812,096
├─CNN1DLayer: 1-2 -- [10, 41, 128] [10, 512, 1] --
│ └─Sequential: 2-2 -- [10, 128, 41] [10, 512, 1] --
│ │ └─Conv1d: 3-1 [3] [10, 128, 41] [10, 256, 39] 98,560
│ │ └─ReLU: 3-2 -- [10, 256, 39] [10, 256, 39] --
│ │ └─MaxPool1d: 3-3 2 [10, 256, 39] [10, 256, 19] --
│ │ └─Conv1d: 3-4 [3] [10, 256, 19] [10, 512, 17] 393,728
│ │ └─ReLU: 3-5 -- [10, 512, 17] [10, 512, 17] --
│ │ └─MaxPool1d: 3-6 2 [10, 512, 17] [10, 512, 8] --
│ │ └─AdaptiveAvgPool1d: 3-7 -- [10, 512, 8] [10, 512, 1] --
├─FCLayer: 1-3 -- [10, 512, 1] [10, 1] --
│ └─Sequential: 2-3 -- [10, 512, 1] [10, 1] --
│ │ └─Flatten: 3-8 -- [10, 512, 1] [10, 512] --
│ │ └─Linear: 3-9 -- [10, 512] [10, 1] 513
============================================================================================================================================
Total params: 4,304,897
Trainable params: 4,304,897
Non-trainable params: 0
Total mult-adds (Units.MEGABYTES): 143.50
============================================================================================================================================
Input size (MB): 0.00
Forward/backward pass size (MB): 1.91
Params size (MB): 17.22
Estimated Total Size (MB): 19.14
============================================================================================================================================
"""
Engine
- Train Step
import numpy as np
from tqdm.auto import tqdm

def train_step(
model:torch.nn.Module,
dataloader:torch.utils.data.DataLoader,
loss_fn:torch.nn.Module,
optimizer:torch.optim.Optimizer,
device:str
):
model.train()
train_loss, train_acc = 0, 0
for X, y in tqdm(dataloader, desc='train step', leave=False):
X, y = X.to(device), y.to(device)
y_pred = model(X)
loss = loss_fn(y_pred, y)
train_loss += loss.item()
optimizer.zero_grad()
loss.backward()
optimizer.step()
        # With a single output logit, argmax over dim=1 always returns 0,
        # so compute accuracy from the sigmoid probability instead.
        y_pred_class = (torch.sigmoid(y_pred) > 0.5).float()
        train_acc += (y_pred_class == y).sum().item() / len(y_pred)
train_loss = train_loss / len(dataloader)
train_acc = train_acc / len(dataloader)
return train_loss, train_acc
- Test Step
def test_step(
model:torch.nn.Module,
dataloader:torch.utils.data.DataLoader,
loss_fn:torch.nn.Module,
device:str
):
model.eval()
test_loss, test_acc = 0, 0
with torch.inference_mode():
for X, y in tqdm(dataloader, desc='test step', leave=False):
X, y = X.to(device), y.to(device)
test_pred_logits = model(X)
loss = loss_fn(test_pred_logits, y)
test_loss += loss.item()
            # Same fix as in train_step: threshold the sigmoid of the logit.
            test_pred_labels = (torch.sigmoid(test_pred_logits) > 0.5).float()
            test_acc += ((test_pred_labels == y).sum().item() / len(test_pred_labels))
test_loss = test_loss / len(dataloader)
test_acc = test_acc / len(dataloader)
return test_loss, test_acc
- Early Stop
class EarlyStopper(object):
def __init__(self, num_trials, save_path):
self.num_trials = num_trials
self.trial_counter = 0
self.best_loss = np.inf
self.save_path = save_path
def is_continuable(self, model, loss):
if loss < self.best_loss:
self.best_loss = loss
self.trial_counter = 0
torch.save(model, self.save_path)
return True
elif self.trial_counter + 1 < self.num_trials:
self.trial_counter += 1
return True
else:
return False
def get_best_model(self, device):
return torch.load(self.save_path).to(device)
- Plot Loss
import matplotlib.pyplot as plt
def plot_loss_curves(results):
loss = results['train_loss']
test_loss = results['test_loss']
accuracy = results['train_acc']
test_accuracy = results['test_acc']
epochs = range(len(results['train_loss']))
plt.figure(figsize=(15, 7))
plt.subplot(1, 2, 1)
plt.plot(epochs, loss, label='Train Loss')
plt.plot(epochs, test_loss, label='Test Loss')
plt.title('Loss')
plt.xlabel('Epochs')
plt.legend()
plt.subplot(1, 2, 2)
plt.plot(epochs, accuracy, label='Train Accuracy')
plt.plot(epochs, test_accuracy, label='Test Accuracy')
plt.title('Accuracy')
plt.xlabel('Epochs')
plt.legend()
- Main
def main(
model:torch.nn.Module,
train_dataloader:torch.utils.data.DataLoader,
test_dataloader:torch.utils.data.DataLoader,
optimizer:torch.optim.Optimizer,
early_stopper,
loss_fn:torch.nn.Module,
device:str,
epochs:int=5
):
results = {
'train_loss': [],
'train_acc': [],
'test_loss': [],
'test_acc': []
}
model.to(device)
for epoch in tqdm(range(epochs)):
train_loss, train_acc = train_step(
model=model,
dataloader=train_dataloader,
loss_fn=loss_fn,
optimizer=optimizer,
device=device
)
test_loss, test_acc = test_step(
model=model,
dataloader=test_dataloader,
loss_fn=loss_fn,
device=device
)
print(
f'Epoch: {epoch + 1} | '
f'train_loss: {train_loss:.4f} | '
f'train_acc: {train_acc:.4f} | '
f'test_loss: {test_loss:.4f} | '
f'test_acc: {test_acc:.4f}'
)
results['train_loss'].append(train_loss)
results['train_acc'].append(train_acc)
results['test_loss'].append(test_loss)
results['test_acc'].append(test_acc)
if not early_stopper.is_continuable(model, test_loss):
print(f'validation: best loss: {early_stopper.best_loss}')
break
return results
Training
- Note: the accuracy originally did not come out right. With a single output logit, torch.argmax(torch.softmax(y_pred, dim=1), dim=1) always returns class 0; the train/test steps above therefore threshold the sigmoid of the logit at 0.5 instead.
model = CNN1DModel(len(vocab)).to(device)
train_dataloader = DataLoader(dataset=dt_train, batch_size=256, shuffle=True)
test_dataloader = DataLoader(dataset=dt_test, batch_size=256, shuffle=False)
optimizer = torch.optim.Adam(params=model.parameters(), lr=0.001)
early_stopper = EarlyStopper(num_trials=10, save_path='./trained_model.pth')
loss_fn = nn.BCEWithLogitsLoss()
result = main(
model=model,
train_dataloader=train_dataloader,
test_dataloader=test_dataloader,
optimizer=optimizer,
    early_stopper=early_stopper,
loss_fn=loss_fn,
device=device,
    epochs=50
)
1-2. CNN 1D Model with Cross Validation
Load Data
from google.colab import drive
drive.mount('/content/data')
import pandas as pd
data_path = ''
df_ko = pd.read_csv(data_path + 'ratings_train.txt', sep='\t')
df_ko = df_ko[:100000]
df_ko.shape
# (100000, 3)
Cleaning
df_ko.dropna(inplace=True)
df_ko['document'] = df_ko['document'].map(
lambda x: x.replace('..', '').replace('ㅡㅡ', '').replace('ㅠ.ㅠ', '')\
.replace('ㅋ', '').replace('ㅎ', '').replace('~', '').replace('^^', '')
)
Tokenization
!pip install kiwipiepy
- Create the tokenizer
from kiwipiepy import Kiwi
kiwi = Kiwi()
- Stemming / Stopword
from kiwipiepy.utils import Stopwords
stopwords = Stopwords()
def tokenizer(text):
tokens = kiwi.tokenize(text, stopwords=stopwords)
return [t.form for t in tokens if t.tag[0] in 'NJMV']
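Here t.tag is a Sejong-style POS tag, so the filter keeps tokens whose tag starts with N (nouns), J (particles), M (determiners/adverbs), or V (verbs/adjectives). A quick smoke test; the exact morphemes returned depend on the Kiwi model version:

print(tokenizer('이 영화 정말 재미있게 봤어요'))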
- Vocabulary
!pip install -U torchtext==0.15.2
from torchtext.vocab import build_vocab_from_iterator
from tqdm.auto import tqdm
def yield_tokens(data, tokenizer):
for text in tqdm(data):
yield tokenizer(text)
gen = yield_tokens(df_ko['document'], tokenizer)
vocab = build_vocab_from_iterator(gen, specials=['<pad>', '<unk>'])
vocab.set_default_index(vocab['<unk>'])
len(vocab)
# 35735
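The specials occupy the first indices, and unseen tokens fall back to the default index. A quick lookup (the index of any real token depends on its corpus frequency):

vocab(['<pad>', '<unk>'])
# [0, 1]
vocab(['영화'])
# e.g. [2] if it is the most frequent token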
Dataset Class
import numpy as np
import torch
from torch.utils.data import Dataset
class ReviewDataset(Dataset):
def __init__(self, vocab, tokenizer, features, targets=None, max_len=None) -> None:
super().__init__()
self.max_len = max_len
        self.targets = np.array(targets).reshape(-1, 1) if targets is not None else None
self.do_tokenize(vocab, tokenizer, features)
def do_tokenize(self, vocab, tokenizer, features):
_features = [vocab(tokenizer(text)) for text in tqdm(features)]
if self.max_len is None:
self.max_len = max([len(token) for token in _features])
self.features = np.array([
token + vocab(['<pad>']) * (self.max_len - len(token))
if len(token) <= self.max_len else token[:self.max_len]
for token in tqdm(_features)
])
def __len__(self):
return self.features.shape[0]
def __getitem__(self, index:int):
feature = torch.LongTensor(self.features[index])
target = None
if self.targets is not None:
target = torch.Tensor(self.targets[index])
return feature, target
dt = ReviewDataset(
vocab=vocab,
tokenizer=tokenizer,
features=df_ko['document'].tolist(),
targets=df_ko['label'].tolist()
)
len(dt)
# 99998
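A single sample should come back as a padded LongTensor of token ids plus a float target, consistent with the batch shapes seen earlier:

feature, target = dt[0]
feature.shape, target.shape
# e.g. (torch.Size([52]), torch.Size([1]))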
CNN 1D Model
- Embedding Layer
class EmbeddingLayer(nn.Module):
def __init__(self, vocab_len, embedding=128) -> None:
super().__init__()
self.emb = nn.Embedding(vocab_len, embedding)
def forward(self, x):
return self.emb(x)
- CNN 1D Layer
class CNN1DLayer(nn.Module):
def __init__(self, embedding) -> None:
super().__init__()
self.cnn_1d = nn.Sequential(
nn.Conv1d(in_channels=embedding, out_channels=embedding*2, kernel_size=3),
nn.ReLU(),
nn.MaxPool1d(2),
nn.Conv1d(in_channels=embedding*2, out_channels=embedding*4, kernel_size=3),
nn.ReLU(),
nn.MaxPool1d(2),
nn.AdaptiveAvgPool1d(1)
)
def forward(self, x):
pre_x = x.permute(0, 2, 1)
out = self.cnn_1d(pre_x)
return out
- FC Layer
class FCLayer(nn.Module):
def __init__(self, embedding, target_size=1) -> None:
super().__init__()
self.fc = nn.Sequential(
nn.Flatten(),
nn.Linear(embedding*4, target_size)
)
def forward(self, x):
return self.fc(x)
- CNN 1D Model
class CNN1DModel(nn.Module):
def __init__(self, vocab_len, embedding=128, target_size=1) -> None:
super().__init__()
self.emb = EmbeddingLayer(vocab_len=vocab_len, embedding=embedding)
self.cnn = CNN1DLayer(embedding=embedding)
self.fc = FCLayer(embedding=embedding, target_size=target_size)
def forward(self, x):
emb_out = self.emb(x)
cnn_out = self.cnn(emb_out)
fc_out = self.fc(cnn_out)
return fc_out
device = 'cuda' if torch.cuda.is_available() else 'cpu'
device
# 'cpu'
cnn_model = CNN1DModel(vocab_len=len(vocab)).to(device)
Engine
- Train step
def train_step(
model:torch.nn.Module,
dataloader:torch.utils.data.DataLoader,
loss_fn:torch.nn.Module,
optimizer:torch.optim.Optimizer,
device:str
):
model.train()
train_loss, train_acc = 0, 0
for X, y in tqdm(dataloader, desc='train step', leave=False):
X, y = X.to(device), y.to(device)
y_pred = model(X)
loss = loss_fn(y_pred, y)
train_loss += loss.item()
optimizer.zero_grad()
loss.backward()
optimizer.step()
        # With a single output logit, argmax over dim=1 always returns 0,
        # so compute accuracy from the sigmoid probability instead.
        y_pred_class = (torch.sigmoid(y_pred) > 0.5).float()
        train_acc += (y_pred_class == y).sum().item() / len(y_pred)
train_loss = train_loss / len(dataloader)
train_acc = train_acc / len(dataloader)
return train_loss, train_acc
- Test step
def test_step(
model:torch.nn.Module,
dataloader:torch.utils.data.DataLoader,
loss_fn:torch.nn.Module,
device:str
):
model.eval()
test_loss, test_acc = 0, 0
with torch.inference_mode():
for X, y in tqdm(dataloader, desc='test step', leave=False):
X, y = X.to(device), y.to(device)
test_pred_logits = model(X)
loss = loss_fn(test_pred_logits, y)
test_loss += loss.item()
            # Same fix as in train_step: threshold the sigmoid of the logit.
            test_pred_labels = (torch.sigmoid(test_pred_logits) > 0.5).float()
            test_acc += ((test_pred_labels == y).sum().item() / len(test_pred_labels))
test_loss = test_loss / len(dataloader)
test_acc = test_acc / len(dataloader)
return test_loss, test_acc
- Early Stop
class EarlyStopper(object):
def __init__(self, num_trials, save_path):
self.num_trials = num_trials
self.trial_counter = 0
self.best_loss = np.inf
self.save_path = save_path
def is_continuable(self, model, loss):
if loss < self.best_loss:
self.best_loss = loss
self.trial_counter = 0
torch.save(model, self.save_path)
return True
elif self.trial_counter + 1 < self.num_trials:
self.trial_counter += 1
return True
else:
return False
def get_best_model(self, device):
return torch.load(self.save_path).to(device)
- Plot Loss
import matplotlib.pyplot as plt
def plot_loss_curves(results):
loss = results['train_loss']
test_loss = results['test_loss']
accuracy = results['train_acc']
test_accuracy = results['test_acc']
epochs = range(len(results['train_loss']))
plt.figure(figsize=(15, 7))
plt.subplot(1, 2, 1)
plt.plot(epochs, loss, label='Train Loss')
plt.plot(epochs, test_loss, label='Test Loss')
plt.title('Loss')
plt.xlabel('Epochs')
plt.legend()
plt.subplot(1, 2, 2)
plt.plot(epochs, accuracy, label='Train Accuracy')
plt.plot(epochs, test_accuracy, label='Test Accuracy')
plt.title('Accuracy')
plt.xlabel('Epochs')
plt.legend()
- Main
def main(
model:torch.nn.Module,
train_dataloader:torch.utils.data.DataLoader,
test_dataloader:torch.utils.data.DataLoader,
optimizer:torch.optim.Optimizer,
early_stopper,
loss_fn:torch.nn.Module,
device:str,
epochs:int=5
):
results = {
'train_loss': [],
'train_acc': [],
'test_loss': [],
'test_acc': []
}
model.to(device)
for epoch in tqdm(range(epochs)):
train_loss, train_acc = train_step(
model=model,
dataloader=train_dataloader,
loss_fn=loss_fn,
optimizer=optimizer,
device=device
)
test_loss, test_acc = test_step(
model=model,
dataloader=test_dataloader,
loss_fn=loss_fn,
device=device
)
print(
f'Epoch: {epoch + 1} | '
f'train_loss: {train_loss:.4f} | '
f'train_acc: {train_acc:.4f} | '
f'test_loss: {test_loss:.4f} | '
f'test_acc: {test_acc:.4f}'
)
results['train_loss'].append(train_loss)
results['train_acc'].append(train_acc)
results['test_loss'].append(test_loss)
results['test_acc'].append(test_acc)
if not early_stopper.is_continuable(model, test_loss):
print(f'validation: best loss: {early_stopper.best_loss}')
break
return results
Training
- K-Fold
from sklearn.model_selection import StratifiedKFold
cv = StratifiedKFold(n_splits=5, shuffle=True)
- Optimizer
optimizer = torch.optim.AdamW(params=cnn_model.parameters(), lr=0.001)
- Early Stopper
early_stopper = EarlyStopper(num_trials=5, save_path='./trained_model.pth')
- Loss Function
loss_fn = nn.BCEWithLogitsLoss()
- Training
from torch.utils.data import DataLoader
for train_idx, valid_idx in cv.split(df_ko, df_ko['label']):
    label_train = df_ko['label'].iloc[train_idx]
    label_valid = df_ko['label'].iloc[valid_idx]
    feature_train = df_ko['document'].iloc[train_idx]
    feature_valid = df_ko['document'].iloc[valid_idx]
dt_train = ReviewDataset(vocab=vocab, tokenizer=tokenizer, features=feature_train.tolist(), targets=label_train.tolist())
dt_test = ReviewDataset(vocab=vocab, tokenizer=tokenizer, features=feature_valid.tolist(), targets=label_valid.tolist())
dl_train = DataLoader(dt_train, batch_size=256, shuffle=True)
    dl_test = DataLoader(dt_test, batch_size=256, shuffle=False)
    # Note: the same model/optimizer instance carries over between folds here;
    # for a clean CV estimate, re-instantiate both at the top of each fold.
    result = main(
        model=cnn_model,
train_dataloader=dl_train,
test_dataloader=dl_test,
optimizer=optimizer,
early_stopper=early_stopper,
loss_fn=loss_fn,
device=device,
epochs=10
)
1-3. CNN 1D Model with HPO
Install
- Ray Tune
- The industry-standard tool for distributed hyperparameter tuning
- Ships with state-of-the-art hyperparameter search algorithms
- Integrates with TensorBoard and other analysis libraries
- Natively supports distributed training through Ray's machine learning engine
!pip install -q ray[tune] # HPO (hyperparameter optimization) module
!pip install -U torchtext==0.15.2
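Before wiring Tune into the full training code below, here is a minimal, self-contained sketch of the loop it follows (toy objective invented for illustration; Ray 2.x function-trainable API):

from ray import train, tune

def toy_objective(config):
    # Pretend 'x' is a hyperparameter; report one 'loss' value per step.
    for step in range(10):
        train.report({'loss': (config['x'] - 3) ** 2 + 1 / (step + 1)})

analysis = tune.run(toy_objective, config={'x': tune.uniform(0, 10)}, num_samples=5)
print(analysis.get_best_config(metric='loss', mode='min'))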
Import
import numpy as np
import pandas as pd
import torch
from torch import nn
from torchtext.vocab import build_vocab_from_iterator
from torch.utils.data import Dataset, DataLoader
import torch.optim as optim
from torch.utils.data import random_split
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import spacy
from ray import tune
from ray import train
from ray.train import get_checkpoint, Checkpoint
from ray.tune.schedulers import ASHAScheduler
from functools import partial
import os
import tempfile
from pathlib import Path
import ray.cloudpickle as pickle
from tqdm.auto import tqdm
Global Variables
import easydict
args = easydict.EasyDict()
args.device = 'cuda' if torch.cuda.is_available() else 'cpu'
args.epochs = 50
args.trials = 5
args.save_model_name = './best_trained_model.pth'
Load Data
from google.colab import drive
drive.mount('/content/data')
default_path = ''
df_iphone = pd.read_csv(default_path + 'iphone.csv')
df_iphone.shape
# (3062, 11)
Engine
- Reset Seeds
import random
import os
def reset_seeds(seed=42):
random.seed(seed)
os.environ['PYTHONHASHSEED'] = str(seed)
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)
torch.backends.cudnn.deterministic = True
- Train Loop
def train_loop(model, dataloader, loss_fn, optimizer, device):
model.train()
epoch_loss, total_acc = 0, 0
sig = torch.nn.Sigmoid()
for batch in dataloader:
pred = model(batch['features'].to(device))
loss = loss_fn(pred, batch['target'].to(device))
optimizer.zero_grad()
loss.backward()
optimizer.step()
epoch_loss += loss.item()
pred_ = sig(pred)
pred_ = pred_.to('cpu').detach().numpy()
pred_ = (pred_ > 0.5).astype(int)
total_acc += accuracy_score(batch['target'].to('cpu').numpy(), pred_)
epoch_loss /= len(dataloader)
total_acc /= len(dataloader)
return epoch_loss, total_acc
- Test Loop
@torch.inference_mode()
def test_loop(model, dataloader, loss_fn, device):
model.eval()
epoch_loss, total_acc = 0, 0
pred_list = []
sig = torch.nn.Sigmoid()
for batch in dataloader:
pred = model(batch['features'].to(device))
pred_ = sig(pred)
pred_ = pred_.to('cpu').numpy()
pred_list.append(pred_)
if batch.get('target') is not None:
loss = loss_fn(pred, batch['target'].to(device))
epoch_loss += loss.item()
pred_ = (pred_ > 0.5).astype(int)
total_acc += accuracy_score(batch['target'].to('cpu').numpy(), pred_)
epoch_loss /= len(dataloader)
total_acc /= len(dataloader)
return epoch_loss, total_acc
- Early Stop
class EarlyStopper(object):
def __init__(self, num_trials, save_path):
self.num_trials = num_trials
self.trial_counter = 0
        self.best_loss = np.inf
self.save_path = save_path
def is_continuable(self, model, loss):
if loss < self.best_loss:
self.best_loss = loss
self.trial_counter = 0
torch.save(model, self.save_path)
return True
elif self.trial_counter + 1 < self.num_trials:
self.trial_counter += 1
return True
else:
return False
def get_best_model(self, device=args.device):
return torch.load(self.save_path).to(device)
Model
- Embedding Layer
class EmbeddingLayer(nn.Module):
def __init__(self, vocab_len, embedding=128) -> None:
super().__init__()
self.emb = nn.Embedding(vocab_len, embedding)
def forward(self, x):
return self.emb(x)
- CNN Layer
class CNN1DLayer(nn.Module):
def __init__(self, embedding) -> None:
super().__init__()
self.cnn_1d = nn.Sequential(
nn.Conv1d(in_channels=embedding, out_channels=embedding*2, kernel_size=3),
nn.ReLU(),
nn.MaxPool1d(2),
nn.Conv1d(in_channels=embedding*2, out_channels=embedding*4, kernel_size=3),
nn.ReLU(),
nn.MaxPool1d(2),
nn.AdaptiveAvgPool1d(1)
)
def forward(self, x):
        pre_x = x.permute(0, 2, 1)
out = self.cnn_1d(pre_x)
return out
- FC Layer
class FCLayer(nn.Module):
def __init__(self, embedding, target_size=1) -> None:
super().__init__()
self.fc = nn.Sequential(
nn.Flatten(),
nn.Linear(embedding*4, target_size)
)
def forward(self, x):
return self.fc(x)
- CNN 1D Model
class CNN1DModel(nn.Module):
def __init__(self, vocab_len, embedding=128, target_size=1) -> None:
super().__init__()
self.emb = EmbeddingLayer(vocab_len=vocab_len, embedding=embedding)
self.cnn = CNN1DLayer(embedding=embedding)
self.fc = FCLayer(embedding=embedding, target_size=target_size)
def forward(self, x):
emb_out = self.emb(x)
cnn_out = self.cnn(emb_out)
fc_out = self.fc(cnn_out)
return fc_out
Text Preprocessing
df_iphone.dropna(inplace=True)
df_iphone['feature'] = df_iphone.apply(
lambda row: str(row['reviewTitle']).strip() + ' ' + str(row['reviewDescription']).strip(),
axis=1
)
df_iphone['target'] = df_iphone['ratingScore'].map(
lambda x: 1 if x >= 5 else 0
)
- Tokenization
import spacy
nlp = spacy.load('en_core_web_sm')
def tokenizer(text):
tokens = nlp(text)
return [token.lemma_ for token in tokens if token.tag_[0] in 'NVJ']
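token.tag_ is a Penn Treebank tag, so this keeps lemmatized nouns (NN*), verbs (VB*), and adjectives (JJ*). For example (exact output depends on the spaCy model version):

print(tokenizer('The battery life is great but the screen scratches easily'))
# e.g. ['battery', 'life', 'be', 'great', 'screen', 'scratch']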
def yield_tokens(datas, tokenizer):
for data in tqdm(datas, desc='tokenizing...', leave=False):
yield tokenizer(data)
gen = yield_tokens(df_iphone['feature'], tokenizer)
vocab = build_vocab_from_iterator(gen, specials=['<pad>', '<unk>'])
vocab.set_default_index(vocab['<unk>'])
len(vocab)
# 6084
Dataset
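The MyDataset class used below was not preserved in these notes. A minimal reconstruction that matches what train_loop/test_loop expect (dict batches with a padded 'features' LongTensor and a float 'target'), assuming the feature/target columns created above:

class MyDataset(Dataset):
    # Hypothetical sketch; the original definition is missing from the notes.
    def __init__(self, vocab, tokenizer, df, max_len=None):
        tokens = [vocab(tokenizer(text)) for text in tqdm(df['feature'])]
        self.max_len = max_len or max(len(t) for t in tokens)
        pad = vocab(['<pad>'])
        self.features = np.array([
            (t + pad * (self.max_len - len(t)))[:self.max_len] for t in tokens
        ])
        self.targets = df['target'].to_numpy().reshape(-1, 1).astype(np.float32)

    def __len__(self):
        return self.features.shape[0]

    def __getitem__(self, index):
        return {
            'features': torch.LongTensor(self.features[index]),
            'target': torch.from_numpy(self.targets[index])
        }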
def main_by_ray(config, data_dir=None):
net = CNN1DModel(len(vocab), config['embedding_dim'])
device = 'cpu'
if torch.cuda.is_available():
device = 'cuda:0'
if torch.cuda.device_count() > 1:
net = nn.DataParallel(net)
net.to(device)
loss_fn = nn.BCEWithLogitsLoss()
optimizer = torch.optim.Adam(params=net.parameters(), lr=config['lr'])
checkpoint = get_checkpoint()
if checkpoint:
with checkpoint.as_directory() as checkpoint_dir:
data_path = Path(checkpoint_dir) / 'data.pkl'
with open(data_path, 'rb') as fp:
checkpoint_state = pickle.load(fp)
start_epoch = checkpoint_state['epoch']
net.load_state_dict(checkpoint_state['net_state_dict'])
optimizer.load_state_dict(checkpoint_state['optimizer_state_dict'])
else:
start_epoch = 0
dt_train = MyDataset(vocab, tokenizer, df_train)
dt_test = MyDataset(vocab, tokenizer, df_test)
dl_train = DataLoader(dt_train, batch_size=config['batch_size'], shuffle=True)
dl_test = DataLoader(dt_test, batch_size=config['batch_size'], shuffle=False)
    for epoch in range(start_epoch, args.epochs):
train_loss, train_acc = train_loop(
model=net, dataloader=dl_train, loss_fn=loss_fn, optimizer=optimizer, device=device
)
test_loss, test_acc = test_loop(
model=net, dataloader=dl_test, loss_fn=loss_fn, device=device
)
checkpoint_data = {
'epoch': epoch,
'net_state_dict': net.state_dict(),
'optimizer_state_dict': optimizer.state_dict()
}
        with tempfile.TemporaryDirectory() as checkpoint_dir:
            data_path = Path(checkpoint_dir) / 'data.pkl'
            with open(data_path, 'wb') as fp:
                pickle.dump(checkpoint_data, fp)
            # Report the metrics (and checkpoint) back to Tune; the ASHA
            # scheduler keys off the 'loss' metric reported here.
            checkpoint = Checkpoint.from_directory(checkpoint_dir)
            train.report(
                {'loss': test_loss, 'accuracy': test_acc},
                checkpoint=checkpoint
            )
Training
- This could not be run to completion due to insufficient resources: each trial requests cpu: 6 and gpu: gpus_per_trial (default 2), which is more than a typical Colab session provides.
df_train, df_test = train_test_split(df_iphone, test_size=0.2, shuffle=True, stratify=df_iphone['target'])
def trial_str_creator(trial):
return '{}_{}_trial'.format(trial.trainable_name, trial.trial_id)
def main(num_samples=10, max_num_epochs=10, gpus_per_trial=2):
data_dir = os.path.abspath('./load_data')
config = {
'embedding_dim': tune.choice([64, 128, 256]),
'lr': tune.loguniform(1e-4, 1e-1),
'batch_size': tune.choice([16, 32, 64])
}
scheduler = ASHAScheduler(
metric='loss',
mode='min',
max_t=max_num_epochs,
grace_period=1,
reduction_factor=2
)
reporter = tune.JupyterNotebookReporter(
metric_columns=['loss', 'accuracy', 'training_iteration']
)
    result = tune.run(
        partial(main_by_ray, data_dir=data_dir),
        resources_per_trial={
            'cpu': 6,
            'gpu': gpus_per_trial
        },
        config=config,
        num_samples=num_samples,
        scheduler=scheduler,
        progress_reporter=reporter,  # wire in the reporter defined above
        trial_dirname_creator=trial_str_creator
    )
return result
result = main(num_samples=4, max_num_epochs=5)