

Day 41. NLP Deep Learning - CNN 1D Model


Day 41 Retrospective.

 

I found out that there are less than four weeks left until the 빅데이터분석기사 (Big Data Analysis Engineer) exam. I have heard that it is a difficult exam, so I should start studying as soon as possible.

1. CNN 1D Model

 

 

1-1. CNN 1D Model

 

DataLoader

from torch.utils.data import DataLoader
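
# NOTE: dt_train / dt_test here are ReviewDataset instances; the class itself
# is defined below in section 1-2 (Dataset Class).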

dl_train = DataLoader(
    dataset=dt_train,
    batch_size=256,
    shuffle=True
)
dl_test = DataLoader(
    dataset=dt_test,
    batch_size=256,
    shuffle=False
)

len(dl_train)
# 274

features, targets = next(iter(dl_train))
features.shape, targets.shape
# (torch.Size([256, 52]), torch.Size([256, 1]))

 

CNN 1D Model

 

import torch
from torch import nn
  • Embedding Layer
class EmbeddingLayer(nn.Module):
    def __init__(self, vocab_len, embedding=128) -> None:
        super().__init__()
        self.emb = nn.Embedding(vocab_len, embedding)
    
    def forward(self, x):
        return self.emb(x)

# Test
embedding_layer = EmbeddingLayer(len(vocab))
emb_out = embedding_layer(features)
emb_out.shape
# torch.Size([256, 52, 128])
  • CNN 1D Layer
class CNN1DLayer(nn.Module):
    def __init__(self, embedding) -> None:
        super().__init__()
        self.cnn_1d = nn.Sequential(
            nn.Conv1d(in_channels=embedding, out_channels=embedding*2, kernel_size=3),
            nn.ReLU(),
            nn.MaxPool1d(2),
            nn.Conv1d(in_channels=embedding*2, out_channels=embedding*4, kernel_size=3),
            nn.ReLU(),
            nn.MaxPool1d(2),
            nn.AdaptiveAvgPool1d(1)
        )
    
    def forward(self, x):
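        # (batch, seq_len, emb) -> (batch, emb, seq_len): nn.Conv1d expects channels-first input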
        pre_x = x.permute(0, 2, 1)
        out = self.cnn_1d(pre_x)
        return out

# Test
cnn1d_layer = CNN1DLayer(128)
cnn_out = cnn1d_layer(emb_out)
cnn_out.shape
# torch.Size([256, 512, 1])
  • FC Layer
class FCLayer(nn.Module):
    def __init__(self, embedding, target_size=1) -> None:
        super().__init__()
        self.fc = nn.Sequential(
            nn.Flatten(),
            nn.Linear(embedding*4, target_size)
        )
    
    def forward(self, x):
        return self.fc(x)

# Test
fc_layer = FCLayer(128)
fc_out = fc_layer(cnn_out)
fc_out.shape
# torch.Size([256, 1])
  • CNN 1D Model
class CNN1DModel(nn.Module):
    def __init__(self, vocab_len, embedding=128, target_size=1) -> None:
        super().__init__()
        self.emb = EmbeddingLayer(vocab_len=vocab_len, embedding=embedding)
        self.cnn = CNN1DLayer(embedding=embedding)
        self.fc = FCLayer(embedding=embedding, target_size=target_size)
    
    def forward(self, x):
        emb_out = self.emb(x)
        cnn_out = self.cnn(emb_out)
        fc_out = self.fc(cnn_out)
        return fc_out

device = 'cuda' if torch.cuda.is_available() else 'cpu'
device
# 'cpu'

cnn_model = CNN1DModel(vocab_len=len(vocab)).to(device)
!pip install torchinfo
import torchinfo

torchinfo.summary(
    model=cnn_model,
    input_size=(10, 41),
    dtypes=[torch.long],
    col_names=['kernel_size', 'input_size', 'output_size', 'num_params']
)
"""
============================================================================================================================================
Layer (type:depth-idx)                   Kernel Shape              Input Shape               Output Shape              Param #
============================================================================================================================================
CNN1DModel                               --                        [10, 41]                  [10, 1]                   --
├─EmbeddingLayer: 1-1                    --                        [10, 41]                  [10, 41, 128]             --
│    └─Embedding: 2-1                    --                        [10, 41]                  [10, 41, 128]             3,812,096
├─CNN1DLayer: 1-2                        --                        [10, 41, 128]             [10, 512, 1]              --
│    └─Sequential: 2-2                   --                        [10, 128, 41]             [10, 512, 1]              --
│    │    └─Conv1d: 3-1                  [3]                       [10, 128, 41]             [10, 256, 39]             98,560
│    │    └─ReLU: 3-2                    --                        [10, 256, 39]             [10, 256, 39]             --
│    │    └─MaxPool1d: 3-3               2                         [10, 256, 39]             [10, 256, 19]             --
│    │    └─Conv1d: 3-4                  [3]                       [10, 256, 19]             [10, 512, 17]             393,728
│    │    └─ReLU: 3-5                    --                        [10, 512, 17]             [10, 512, 17]             --
│    │    └─MaxPool1d: 3-6               2                         [10, 512, 17]             [10, 512, 8]              --
│    │    └─AdaptiveAvgPool1d: 3-7       --                        [10, 512, 8]              [10, 512, 1]              --
├─FCLayer: 1-3                           --                        [10, 512, 1]              [10, 1]                   --
│    └─Sequential: 2-3                   --                        [10, 512, 1]              [10, 1]                   --
│    │    └─Flatten: 3-8                 --                        [10, 512, 1]              [10, 512]                 --
│    │    └─Linear: 3-9                  --                        [10, 512]                 [10, 1]                   513
============================================================================================================================================
Total params: 4,304,897
Trainable params: 4,304,897
Non-trainable params: 0
Total mult-adds (Units.MEGABYTES): 143.50
============================================================================================================================================
Input size (MB): 0.00
Forward/backward pass size (MB): 1.91
Params size (MB): 17.22
Estimated Total Size (MB): 19.14
============================================================================================================================================
"""

 

 

Engine

  • Train Step
def train_step(
        model:torch.nn.Module,
        dataloader:torch.utils.data.DataLoader,
        loss_fn:torch.nn.Module,
        optimizer:torch.optim.Optimizer,
        device:str
):
    model.train()
    
    train_loss, train_acc = 0, 0
    
    for X, y in tqdm(dataloader, desc='train step', leave=False):
        X, y = X.to(device), y.to(device)
        
        y_pred = model(X)
        
        loss = loss_fn(y_pred, y)
        train_loss += loss.item()
        
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
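        # NOTE: argmax over dim=1 of a [batch, 1] logit is always 0, so the
        # accuracy below is not meaningful for this binary BCEWithLogitsLoss setup
        # (see the note in the Training section).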
        
        y_pred_class = torch.argmax(torch.softmax(y_pred, dim=1), dim=1)
        train_acc += (y_pred_class == y).sum().item() / len(y_pred)
    
    train_loss = train_loss / len(dataloader)
    train_acc = train_acc / len(dataloader)
    
    return train_loss, train_acc
  • Test Step
def test_step(
        model:torch.nn.Module,
        dataloader:torch.utils.data.DataLoader,
        loss_fn:torch.nn.Module,
        device:str
):
    model.eval()
    
    test_loss, test_acc = 0, 0
    
    with torch.inference_mode():
        for X, y in tqdm(dataloader, desc='test step', leave=False):
            X, y = X.to(device), y.to(device)
            
            test_pred_logits = model(X)
            
            loss = loss_fn(test_pred_logits, y)
            test_loss += loss.item()
            
            test_pred_labels = test_pred_logits.argmax(dim=1)
            test_acc += ((test_pred_labels == y).sum().item() / len(test_pred_labels))
    
    test_loss = test_loss / len(dataloader)
    test_acc = test_acc / len(dataloader)
    
    return test_loss, test_acc
  • Early Stop
class EarlyStopper(object):
    def __init__(self, num_trials, save_path):
        self.num_trials = num_trials
        self.trial_counter = 0
        self.best_loss = np.inf
        self.save_path = save_path
    
    def is_continuable(self, model, loss):
        if loss < self.best_loss:
            self.best_loss = loss
            self.trial_counter = 0
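            # This pickles the entire module; saving model.state_dict() is the more portable convention.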
            torch.save(model, self.save_path)
            return True
        elif self.trial_counter + 1 < self.num_trials:
            self.trial_counter += 1
            return True
        else:
            return False
    
    def get_best_model(self, device):
        return torch.load(self.save_path).to(device)
  • Plot Loss
import matplotlib.pyplot as plt

def plot_loss_curves(results):
    loss = results['train_loss']
    test_loss = results['test_loss']
    
    accuracy = results['train_acc']
    test_accuracy = results['test_acc']
    
    epochs = range(len(results['train_loss']))
    
    plt.figure(figsize=(15, 7))
    
    plt.subplot(1, 2, 1)
    plt.plot(epochs, loss, label='Train Loss')
    plt.plot(epochs, test_loss, label='Test Loss')
    plt.title('Loss')
    plt.xlabel('Epochs')
    plt.legend()
    
    plt.subplot(1, 2, 2)
    plt.plot(epochs, accuracy, label='Train Accuracy')
    plt.plot(epochs, test_accuracy, label='Test Accuracy')
    plt.title('Accuracy')
    plt.xlabel('Epochs')
    plt.legend()
  • Main
def main(
        model:torch.nn.Module,
        train_dataloader:torch.utils.data.DataLoader,
        test_dataloader:torch.utils.data.DataLoader,
        optimizer:torch.optim.Optimizer,
        early_stopper,
        loss_fn:torch.nn.Module,
        device:str,
        epochs:int=5
):
    results = {
        'train_loss': [],
        'train_acc': [],
        'test_loss': [],
        'test_acc': []
    }
    
    model.to(device)
    
    for epoch in tqdm(range(epochs)):
        train_loss, train_acc = train_step(
            model=model,
            dataloader=train_dataloader,
            loss_fn=loss_fn,
            optimizer=optimizer,
            device=device
        )
        test_loss, test_acc = test_step(
            model=model,
            dataloader=test_dataloader,
            loss_fn=loss_fn,
            device=device
        )
        
        print(
            f'Epoch: {epoch + 1} | '
            f'train_loss: {train_loss:.4f} | '
            f'train_acc: {train_acc:.4f} | '
            f'test_loss: {test_loss:.4f} | '
            f'test_acc: {test_acc:.4f}'
        )
        
        results['train_loss'].append(train_loss)
        results['train_acc'].append(train_acc)
        results['test_loss'].append(test_loss)
        results['test_acc'].append(test_acc)
        
        if not early_stopper.is_continuable(model, test_loss):
            print(f'validation: best loss: {early_stopper.best_loss}')
            break
    
    return results

 

Training

  • A bug occurs here: the accuracy value is not computed correctly. train_step takes argmax over dim=1 of a [batch, 1] logit tensor, which always returns 0, so the reported accuracy is meaningless for this binary setup (see the sketch after this snippet).
model = CNN1DModel(len(vocab)).to(device)

train_dataloader = DataLoader(dataset=dt_train, batch_size=256, shuffle=True)
test_dataloader = DataLoader(dataset=dt_test, batch_size=256, shuffle=False)
optimizer = torch.optim.Adam(params=model.parameters(), lr=0.001)
early_stopper = EarlyStopper(num_trials=10, save_path='./trained_model.pth')
loss_fn = nn.BCEWithLogitsLoss()

result = main(
    model=model,
    train_dataloader=train_dataloader,
    test_dataloader=test_dataloader,
    optimizer=optimizer,
    early_stopper=early_stopper,
    loss_fn=loss_fn,
    device=device,
    epochs=50
)
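
  • A minimal sketch of a sigmoid-threshold accuracy that fixes the issue above (the helper name binary_accuracy is mine):
def binary_accuracy(logits, targets, threshold=0.5):
    # Apply sigmoid to the raw BCEWithLogitsLoss logits, then threshold to get 0/1 predictions
    preds = (torch.sigmoid(logits) > threshold).float()
    return (preds == targets).float().mean().item()

# Inside train_step / test_step, replace the argmax-based accuracy lines with:
# train_acc += binary_accuracy(y_pred, y)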

 

 

1-2. CNN 1D Model with Cross Validation

 

Load Data

from google.colab import drive
drive.mount('/content/data')
import pandas as pd

data_path = ''
df_ko = pd.read_csv(data_path + 'ratings_train.txt', sep='\t')
df_ko = df_ko[:100000]
df_ko.shape
# (100000, 3)

 

Cleaning

df_ko.dropna(inplace=True)

df_ko['document'] = df_ko['document'].map(
    lambda x: x.replace('..', '').replace('ㅡㅡ', '').replace('ㅠ.ㅠ', '')\
    .replace('ㅋ', '').replace('ㅎ', '').replace('~', '').replace('^^', '')
)

 

Tokenization

!pip install kiwipiepy
  • Create the tokenizer
from kiwipiepy import Kiwi

kiwi = Kiwi()
  • Stemming / Stopword
from kiwipiepy.utils import Stopwords

stopwords = Stopwords()

def tokenizer(text):
    tokens = kiwi.tokenize(text, stopwords=stopwords)
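    # The filter below keeps tokens whose POS tag starts with N (noun), J (particle),
    # M (modifier), or V (verb/adjective) in Kiwi's Sejong-style tagset.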
    
    return [t.form for t in tokens if t.tag[0] in 'NJMV']
  • Vocabulary
!pip install -U torchtext==0.15.2
from torchtext.vocab import build_vocab_from_iterator
from tqdm.auto import tqdm

def yield_tokens(data, tokenizer):
    for text in tqdm(data):
        yield tokenizer(text)

gen = yield_tokens(df_ko['document'], tokenizer)

vocab=build_vocab_from_iterator(gen, specials=['<pad>', '<unk>'])
vocab.set_default_index(vocab['<unk>'])
len(vocab)
# 35735

 

Dataset Class

import numpy as np
import torch
from torch.utils.data import Dataset

class ReviewDataset(Dataset):
    def __init__(self, vocab, tokenizer, features, targets=None, max_len=None) -> None:
        super().__init__()
        self.max_len = max_len
        self.targets = np.array(targets).reshape(-1, 1) if targets is not None else None
        self.do_tokenize(vocab, tokenizer, features)
    
    def do_tokenize(self, vocab, tokenizer, features):
        _features = [vocab(tokenizer(text)) for text in tqdm(features)]
        
        if self.max_len is None:
            self.max_len = max([len(token) for token in _features])
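        # In the padding below, vocab(['<pad>']) returns [0]; list multiplication
        # right-pads short sequences, and longer ones are truncated to max_len.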
        
        self.features = np.array([
            token + vocab(['<pad>']) * (self.max_len - len(token))
            if len(token) <= self.max_len else token[:self.max_len]
            for token in tqdm(_features)
        ])
    
    def __len__(self):
        return self.features.shape[0]
    
    def __getitem__(self, index:int):
        feature = torch.LongTensor(self.features[index])
        target = None
        if self.targets is not None:
            target = torch.Tensor(self.targets[index])
        return feature, target

dt = ReviewDataset(
    vocab=vocab,
    tokenizer=tokenizer,
    features=df_ko['document'].tolist(),
    targets=df_ko['label'].tolist()
)
len(dt)
# 99998

 

CNN 1D Model

  • Embedding Layer
class EmbeddingLayer(nn.Module):
    def __init__(self, vocab_len, embedding=128) -> None:
        super().__init__()
        self.emb = nn.Embedding(vocab_len, embedding)
    
    def forward(self, x):
        return self.emb(x)
  • CNN 1D Layer
class CNN1DLayer(nn.Module):
    def __init__(self, embedding) -> None:
        super().__init__()
        self.cnn_1d = nn.Sequential(
            nn.Conv1d(in_channels=embedding, out_channels=embedding*2, kernel_size=3),
            nn.ReLU(),
            nn.MaxPool1d(2),
            nn.Conv1d(in_channels=embedding*2, out_channels=embedding*4, kernel_size=3),
            nn.ReLU(),
            nn.MaxPool1d(2),
            nn.AdaptiveAvgPool1d(1)
        )
    
    def forward(self, x):
        pre_x = x.permute(0, 2, 1)
        out = self.cnn_1d(pre_x)
        return out
  • FC Layer
class FCLayer(nn.Module):
    def __init__(self, embedding, target_size=1) -> None:
        super().__init__()
        self.fc = nn.Sequential(
            nn.Flatten(),
            nn.Linear(embedding*4, target_size)
        )
    
    def forward(self, x):
        return self.fc(x)
  • CNN 1D Model
class CNN1DModel(nn.Module):
    def __init__(self, vocab_len, embedding=128, target_size=1) -> None:
        super().__init__()
        self.emb = EmbeddingLayer(vocab_len=vocab_len, embedding=embedding)
        self.cnn = CNN1DLayer(embedding=embedding)
        self.fc = FCLayer(embedding=embedding, target_size=target_size)
    
    def forward(self, x):
        emb_out = self.emb(x)
        cnn_out = self.cnn(emb_out)
        fc_out = self.fc(cnn_out)
        return fc_out

device = 'cuda' if torch.cuda.is_available() else 'cpu'
device
# 'cpu'

cnn_model = CNN1DModel(vocab_len=len(vocab)).to(device)

 

Engine

  • Train step
def train_step(
        model:torch.nn.Module,
        dataloader:torch.utils.data.DataLoader,
        loss_fn:torch.nn.Module,
        optimizer:torch.optim.Optimizer,
        device:str
):
    model.train()
    
    train_loss, train_acc = 0, 0
    
    for X, y in tqdm(dataloader, desc='train step', leave=False):
        X, y = X.to(device), y.to(device)
        
        y_pred = model(X)
        
        loss = loss_fn(y_pred, y)
        train_loss += loss.item()
        
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
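        # NOTE: same accuracy caveat as in section 1-1 (argmax over a [batch, 1] logit is always 0).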
        
        y_pred_class = torch.argmax(torch.softmax(y_pred, dim=1), dim=1)
        train_acc += (y_pred_class == y).sum().item() / len(y_pred)
    
    train_loss = train_loss / len(dataloader)
    train_acc = train_acc / len(dataloader)
    
    return train_loss, train_acc
  • Test step
def test_step(
        model:torch.nn.Module,
        dataloader:torch.utils.data.DataLoader,
        loss_fn:torch.nn.Module,
        device:str
):
    model.eval()
    
    test_loss, test_acc = 0, 0
    
    with torch.inference_mode():
        for X, y in tqdm(dataloader, desc='test step', leave=False):
            X, y = X.to(device), y.to(device)
            
            test_pred_logits = model(X)
            
            loss = loss_fn(test_pred_logits, y)
            test_loss += loss.item()
            
            test_pred_labels = test_pred_logits.argmax(dim=1)
            test_acc += ((test_pred_labels == y).sum().item() / len(test_pred_labels))
    
    test_loss = test_loss / len(dataloader)
    test_acc = test_acc / len(dataloader)
    
    return test_loss, test_acc
  • Early Stop
class EarlyStopper(object):
    def __init__(self, num_trials, save_path):
        self.num_trials = num_trials
        self.trial_counter = 0
        self.best_loss = np.inf
        self.save_path = save_path
    
    def is_continuable(self, model, loss):
        if loss < self.best_loss:
            self.best_loss = loss
            self.trial_counter = 0
            torch.save(model, self.save_path)
            return True
        elif self.trial_counter + 1 < self.num_trials:
            self.trial_counter += 1
            return True
        else:
            return False
    
    def get_best_model(self, device):
        return torch.load(self.save_path).to(device)
  • Plot Loss
import matplotlib.pyplot as plt

def plot_loss_curves(results):
    loss = results['train_loss']
    test_loss = results['test_loss']
    
    accuracy = results['train_acc']
    test_accuracy = results['test_acc']
    
    epochs = range(len(results['train_loss']))
    
    plt.figure(figsize=(15, 7))
    
    plt.subplot(1, 2, 1)
    plt.plot(epochs, loss, label='Train Loss')
    plt.plot(epochs, test_loss, label='Test Loss')
    plt.title('Loss')
    plt.xlabel('Epochs')
    plt.legend()
    
    plt.subplot(1, 2, 2)
    plt.plot(epochs, accuracy, label='Train Accuracy')
    plt.plot(epochs, test_accuracy, label='Test Accuracy')
    plt.title('Accuracy')
    plt.xlabel('Epochs')
    plt.legend()
  • Main
def main(
        model:torch.nn.Module,
        train_dataloader:torch.utils.data.DataLoader,
        test_dataloader:torch.utils.data.DataLoader,
        optimizer:torch.optim.Optimizer,
        early_stopper,
        loss_fn:torch.nn.Module,
        device:str,
        epochs:int=5
):
    results = {
        'train_loss': [],
        'train_acc': [],
        'test_loss': [],
        'test_acc': []
    }
    
    model.to(device)
    
    for epoch in tqdm(range(epochs)):
        train_loss, train_acc = train_step(
            model=model,
            dataloader=train_dataloader,
            loss_fn=loss_fn,
            optimizer=optimizer,
            device=device
        )
        test_loss, test_acc = test_step(
            model=model,
            dataloader=test_dataloader,
            loss_fn=loss_fn,
            device=device
        )
        
        print(
            f'Epoch: {epoch + 1} | '
            f'train_loss: {train_loss:.4f} | '
            f'train_acc: {train_acc:.4f} | '
            f'test_loss: {test_loss:.4f} | '
            f'test_acc: {test_acc:.4f}'
        )
        
        results['train_loss'].append(train_loss)
        results['train_acc'].append(train_acc)
        results['test_loss'].append(test_loss)
        results['test_acc'].append(test_acc)
        
        if not early_stopper.is_continuable(model, test_loss):
            print(f'validation: best loss: {early_stopper.best_loss}')
            break
    
    return results

 

Training

  • K-Fold
from sklearn.model_selection import StratifiedKFold

cv = StratifiedKFold(n_splits=5, shuffle=True)
  • Optimizer
optimizer = torch.optim.AdamW(params=cnn_model.parameters(), lr=0.001)
  • Early Stopper
early_stopper = EarlyStopper(num_trials=5, save_path='./trained_model.pth')
  • Loss Function
loss_fn = nn.BCEWithLogitsLoss()
  • Training
from torch.utils.data import DataLoader

for tr_indexs, val_indexs in cv.split(df_ko, df_ko['label']):
    label_train = df_ko['label'].iloc[tr_indexs]
    label_valid = df_ko['label'].iloc[val_indexs]
    
    feature_train = df_ko['document'].iloc[tr_indexs]
    feature_valid = df_ko['document'].iloc[val_indexs]
    
    dt_train = ReviewDataset(vocab=vocab, tokenizer=tokenizer, features=feature_train.tolist(), targets=label_train.tolist())
    dt_test = ReviewDataset(vocab=vocab, tokenizer=tokenizer, features=feature_valid.tolist(), targets=label_valid.tolist())
    
    dl_train = DataLoader(dt_train, batch_size=256, shuffle=True)
    dl_test = DataLoader(dt_test, batch_size=256, shuffle=False)
    
    result = main(
        model=cnn_model,
        train_dataloader=dl_train,
        test_dataloader=dl_test,
        optimizer=optimizer,
        early_stopper=early_stopper,
        loss_fn=loss_fn,
        device=device,
        epochs=10
    )
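
  • One caveat with the loop above: cnn_model, the optimizer, and the early stopper are created once outside the loop, so each fold continues training the same weights. For an unbiased CV estimate they should be re-initialized per fold; a minimal sketch (the fold_* names are mine):
for tr_indexs, val_indexs in cv.split(df_ko, df_ko['label']):
    fold_model = CNN1DModel(len(vocab)).to(device)                             # fresh weights per fold
    fold_optimizer = torch.optim.AdamW(params=fold_model.parameters(), lr=0.001)
    fold_stopper = EarlyStopper(num_trials=5, save_path='./fold_model.pth')
    # ...build dt_train / dt_test and dl_train / dl_test as above, then call main() with these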

 

 

1-3. CNN 1D Model with HPO

 

Install

  • Ray Tune
    • Industry-standard tool for distributed hyperparameter tuning
    • Includes the latest hyperparameter search algorithms
    • Integrates with TensorBoard and other analysis libraries
    • Scales training through Ray's distributed machine learning engine
!pip install -q ray[tune]			# HPO tuning module
!pip install -U torchtext==0.15.2

 

Import

import numpy as np
import pandas as pd

import torch
from torch import nn
from torchtext.vocab import build_vocab_from_iterator
from torch.utils.data import Dataset, DataLoader
import torch.optim as optim
from torch.utils.data import random_split

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

import spacy

from ray import tune
from ray import train
from ray.train import get_checkpoint, Checkpoint
from ray.tune.schedulers import ASHAScheduler

from functools import partial

import os
import tempfile
from pathlib import Path
import ray.cloudpickle as pickle

from tqdm.auto import tqdm

 

Global Variables

import easydict

args = easydict.EasyDict()

args.device = 'cuda' if torch.cuda.is_available() else 'cpu'
args.epochs = 50
args.trials = 5
args.save_model_name = './best_trained_model.pth'

 

Load Data

from google.colab import drive
drive.mount('/content/data')
default_path = ''
df_iphone = pd.read_csv(default_path + 'iphone.csv')
df_iphone.shape
# (3062, 11)

 

Engine

  • Reset Seeds
import random
import os

def reset_seeds(seed=42):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
  • Train Loop
def train_loop(model, dataloader, loss_fn, optimizer, device):
    model.train()
    
    epoch_loss, total_acc = 0, 0
    
    sig = torch.nn.Sigmoid()
    
    for batch in dataloader:
        pred = model(batch['features'].to(device))
        
        loss = loss_fn(pred, batch['target'].to(device))
        
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        epoch_loss += loss.item()
        
        pred_ = sig(pred)
        pred_ = pred_.to('cpu').detach().numpy()
        pred_ = (pred_ > 0.5).astype(int)
        total_acc += accuracy_score(batch['target'].to('cpu').numpy(), pred_)
    
    epoch_loss /= len(dataloader)
    total_acc /= len(dataloader)
    
    return epoch_loss, total_acc
  • Test Loop
@torch.inference_mode()
def test_loop(model, dataloader, loss_fn, device):
    model.eval()
    
    epoch_loss, total_acc = 0, 0
    pred_list = []
    
    sig = torch.nn.Sigmoid()
    
    for batch in dataloader:
        pred = model(batch['features'].to(device))
        
        pred_ = sig(pred)
        pred_ = pred_.to('cpu').numpy()
        pred_list.append(pred_)
        
        if batch.get('target') is not None:
            loss = loss_fn(pred, batch['target'].to(device))
            epoch_loss += loss.item()
            
            pred_ = (pred_ > 0.5).astype(int)
            total_acc += accuracy_score(batch['target'].to('cpu').numpy(), pred_)
    
    epoch_loss /= len(dataloader)
    total_acc /= len(dataloader)
    
    return epoch_loss, total_acc
  • Early Stop
class EarlyStopper(object):
    def __init__(self, num_trials, save_path):
        self.num_trials = num_trials
        self.trial_counter = 0
        self.best_loss = np.inf
        self.save_path = save_path
    
    def is_continuable(self, model, loss):
        if loss < self.best_loss:
            self.best_loss = loss
            self.trial_counter = 0
            torch.save(model, self.save_path)
            return True
        elif self.trial_counter + 1 < self.num_trials:
            self.trial_counter += 1
            return True
        else:
            return False
    
    def get_best_model(self, device=args.device):
        return torch.load(self.save_path).to(device)

 

Model

  • Embedding Layer
class EmbeddingLayer(nn.Module):
    def __init__(self, vocab_len, embedding=128) -> None:
        super().__init__()
        self.emb = nn.Embedding(vocab_len, embedding)
    
    def forward(self, x):
        return self.emb(x)
  • CNN Layer
class CNN1DLayer(nn.Module):
    def __init__(self, embedding) -> None:
        super().__init__()
        self.cnn_1d = nn.Sequential(
            nn.Conv1d(in_channels=embedding, out_channels=embedding*2, kernel_size=3),
            nn.ReLU(),
            nn.MaxPool1d(2),
            nn.Conv1d(in_channels=embedding*2, out_channels=embedding*4, kernel_size=3),
            nn.ReLU(),
            nn.MaxPool1d(2),
            nn.AdaptiveAvgPool1d(1)
        )
    
    def forward(self, x):
        pre_x = x.permute(0, 2, 1)
        out = self.cnn_1d(pre_x)
        return out
  • FC Layer
class FCLayer(nn.Module):
    def __init__(self, embedding, target_size=1) -> None:
        super().__init__()
        self.fc = nn.Sequential(
            nn.Flatten(),
            nn.Linear(embedding*4, target_size)
        )
    
    def forward(self, x):
        return self.fc(x)
  • CNN 1D Model
class CNN1DModel(nn.Module):
    def __init__(self, vocab_len, embedding=128, target_size=1) -> None:
        super().__init__()
        self.emb = EmbeddingLayer(vocab_len=vocab_len, embedding=embedding)
        self.cnn = CNN1DLayer(embedding=embedding)
        self.fc = FCLayer(embedding=embedding, target_size=target_size)
    
    def forward(self, x):
        emb_out = self.emb(x)
        cnn_out = self.cnn(emb_out)
        fc_out = self.fc(cnn_out)
        return fc_out

 

Text Preprocessing

df_iphone.dropna(inplace=True)

df_iphone['feature'] = df_iphone.apply(
    lambda row: str(row['reviewTitle']).strip() + ' ' + str(row['reviewDescription']).strip(),
    axis=1
)

df_iphone['target'] = df_iphone['ratingScore'].map(
    lambda x: 1 if x >= 5 else 0
)
  • Tokenization
import spacy

nlp = spacy.load('en_core_web_sm')

def tokenizer(text):
    tokens = nlp(text)
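    # token.tag_ is the fine-grained Penn Treebank tag; keep nouns (NN*), verbs (VB*), and adjectives (JJ*)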
    return [token.lemma_ for token in tokens if token.tag_[0] in 'NVJ']

def yield_tokens(datas, tokenizer):
    for data in tqdm(datas, desc='tokenizing...', leave=False):
        yield tokenizer(data)

gen = yield_tokens(df_iphone['feature'], tokenizer)

vocab = build_vocab_from_iterator(gen, specials=['<pad>', '<unk>'])
vocab.set_default_index(vocab['<unk>'])
len(vocab)
# 6084

 

Dataset
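
  • The MyDataset class used by main_by_ray below was not captured in these notes. A minimal sketch consistent with how train_loop / test_loop consume batches (dicts with 'features' and 'target' keys), reusing the padding idea from ReviewDataset; the max_len default is an assumption:
class MyDataset(Dataset):
    def __init__(self, vocab, tokenizer, df, max_len=50):
        self.max_len = max_len
        tokens = [vocab(tokenizer(text)) for text in df['feature']]
        # Right-pad with the <pad> index (0), truncate anything longer than max_len
        self.features = np.array([
            t[:max_len] + [vocab['<pad>']] * max(0, max_len - len(t))
            for t in tokens
        ])
        self.targets = df['target'].to_numpy().reshape(-1, 1).astype(np.float32)
    
    def __len__(self):
        return len(self.features)
    
    def __getitem__(self, idx):
        return {
            'features': torch.LongTensor(self.features[idx]),
            'target': torch.from_numpy(self.targets[idx])
        }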

def main_by_ray(config, data_dir=None):
    net = CNN1DModel(len(vocab), config['embedding_dim'])
    
    device = 'cpu'
    if torch.cuda.is_available():
        device = 'cuda:0'
        if torch.cuda.device_count() > 1:
            net = nn.DataParallel(net)
    
    net.to(device)
    
    loss_fn = nn.BCEWithLogitsLoss()
    optimizer = torch.optim.Adam(params=net.parameters(), lr=config['lr'])
    
    checkpoint = get_checkpoint()
    if checkpoint:
        with checkpoint.as_directory() as checkpoint_dir:
            data_path = Path(checkpoint_dir) / 'data.pkl'
            with open(data_path, 'rb') as fp:
                checkpoint_state = pickle.load(fp)
            
            start_epoch = checkpoint_state['epoch']
            net.load_state_dict(checkpoint_state['net_state_dict'])
            optimizer.load_state_dict(checkpoint_state['optimizer_state_dict'])
    else:
        start_epoch = 0
    
    dt_train = MyDataset(vocab, tokenizer, df_train)
    dt_test = MyDataset(vocab, tokenizer, df_test)
    
    dl_train = DataLoader(dt_train, batch_size=config['batch_size'], shuffle=True)
    dl_test = DataLoader(dt_test, batch_size=config['batch_size'], shuffle=False)
    
    for epoch in range(start_epoch, args.epochs):
        train_loss, train_acc = train_loop(
            model=net, dataloader=dl_train, loss_fn=loss_fn, optimizer=optimizer, device=device
        )
        
        test_loss, test_acc = test_loop(
            model=net, dataloader=dl_test, loss_fn=loss_fn, device=device
        )
        
        checkpoint_data = {
            'epoch': epoch,
            'net_state_dict': net.state_dict(),
            'optimizer_state_dict': optimizer.state_dict()
        }
        
        with tempfile.TemporaryDirectory() as checkpoint_dir:
            data_path = Path(checkpoint_dir) / 'data.pkl'
            with open(data_path, 'wb') as fp:
                pickle.dump(checkpoint_data, fp)
            
            # Report metrics (and the checkpoint) to Ray Tune before the temporary
            # directory is deleted; without this, the ASHAScheduler never sees 'loss'.
            checkpoint = Checkpoint.from_directory(checkpoint_dir)
            train.report(
                {'loss': test_loss, 'accuracy': test_acc},
                checkpoint=checkpoint
            )

 

Training

  • This could not be run to completion due to insufficient compute resources.
df_train, df_test = train_test_split(df_iphone, test_size=0.2, shuffle=True, stratify=df_iphone['target'])

def trial_str_creator(trial):
    return '{}_{}_trial'.format(trial.trainable_name, trial.trial_id)

def main(num_samples=10, max_num_epochs=10, gpus_per_trial=2):
    data_dir = os.path.abspath('./load_data')
    
    config = {
        'embedding_dim': tune.choice([64, 128, 256]),
        'lr': tune.loguniform(1e-4, 1e-1),
        'batch_size': tune.choice([16, 32, 64])
    }
    
    scheduler = ASHAScheduler(
        metric='loss',
        mode='min',
        max_t=max_num_epochs,
        grace_period=1,
        reduction_factor=2
    )
    
    reporter = tune.JupyterNotebookReporter(
        metric_columns=['loss', 'accuracy', 'training_iteration']
    )
    
    result = tune.run(
        partial(main_by_ray, data_dir=data_dir),
        resources_per_trial={
            'cpu': 6,
            'gpu': gpus_per_trial
        },
        config=config,
        num_samples=num_samples,
        scheduler=scheduler,
        progress_reporter=reporter,
        trial_dirname_creator=trial_str_creator
    )
    
    return result

result = main(num_samples=4, max_num_epochs=5)
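
  • If the run completes, the returned analysis object can surface the best trial; a sketch, assuming the 'loss' metric reported from main_by_ray:
best_trial = result.get_best_trial('loss', 'min', 'last')
print('Best config:', best_trial.config)
print('Best test loss:', best_trial.last_result['loss'])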