
SK네트웍스 Family AI캠프 10기 / Daily Retrospective

Day 40. Preparing Natural Language Data - Text Preprocessing & 1D CNN


 

Day 40 Retrospective.

 

 An error kept coming up and I kept staring at the code trying to figure out what was wrong, and it turned out to be a typo. When an error is caused by a typo it is really hard to track down, so from now on I need to check my spelling carefully as well. I also have to prepare for the Big Data Analysis Engineer exam (빅데이터분석기사) and coding tests, but it's open-recruitment season, so I'm swamped writing applications. This time I shouldn't put it off and should get the applications written quickly.

 

 

 

 

1. Text Preprocessing Exercises

 

 

1-1. English Movie Reviews

 

Load Data

from google.colab import drive
drive.mount('/content/data')
import pandas as pd

data_path = ''
df = pd.read_csv(data_path + '/IMDB Dataset.csv')
df.shape
# (50000, 2)

df = df[:5000]
df.shape
# (5000, 2)

 

Cleaning

df['review'] = df['review'].map(lambda x: x.replace('<br />', ''))

 

Tokenizer

  • Create the Tokenizer
import nltk

nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('averaged_perceptron_tagger')
nltk.download('averaged_perceptron_tagger_eng')
  • Stemming / Stopword
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk import pos_tag

nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

def tokenizer(text):
    text = text.lower()
    tokens = nltk.word_tokenize(text)
    tokens = [token for token in tokens if token.lower() not in stop_words]
    
    tagged_tokens = pos_tag(tokens)
    tagged_tokens = [token for token, tag in tagged_tokens if tag[0] in 'NV']
    
    return tagged_tokens
  • Vocabulary
!pip install -U torchtext==0.15.2
from torchtext.vocab import build_vocab_from_iterator
from tqdm.auto import tqdm

def yield_tokens(data, tokenizer):
    for text in tqdm(data):
        yield tokenizer(text)

gen = yield_tokens(df['review'], tokenizer)

vocab = build_vocab_from_iterator(gen, specials=['<pad>', '<unk>'])
vocab.set_default_index(vocab['<unk>'])

len(vocab)
# 38565
  • Run Tokenization
features = [vocab(tokenizer(text)) for text in tqdm(df['review'])]
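
A quick sanity check (my own addition; the sample sentence and its output are illustrative and depend on the NLTK tagger and the vocabulary built above): the tokenizer keeps only noun/verb tokens that are not stopwords, and vocab maps them to integer ids.

# Illustrative round trip: text -> tokens -> ids -> tokens (sample sentence is made up)
sample = "This movie was surprisingly good and the acting felt real."
sample_tokens = tokenizer(sample)           # noun/verb tokens only, stopwords removed
sample_ids = vocab(sample_tokens)           # token ids from the vocabulary
print(sample_tokens)
print(sample_ids)
print(vocab.lookup_tokens(sample_ids))      # ids mapped back to tokens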

 

Padding

max_len = max([len(token) for token in features])
max_len
# 632

features = [token + vocab(['<pad>']) * (max_len - len(token)) for token in tqdm(features)]
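
As another small check (again my own addition, not in the original notebook), every review should now be exactly max_len ids long, with the trailing positions filled by the <pad> id (index 0, since <pad> is the first special token).

# All padded sequences share the same length; the tail should decode to <pad>
assert all(len(token) == max_len for token in features)
print(features[0][-3:])                       # e.g. [0, 0, 0]
print(vocab.lookup_tokens(features[0][-3:]))  # e.g. ['<pad>', '<pad>', '<pad>']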

 

Dataset Class

import numpy as np
import torch
from torch.utils.data import Dataset

class ReviewDataset(Dataset):
    def __init__(self, vocab, tokenizer, features, targets=None) -> None:
        super().__init__()
        self.features = np.array(self.__do_tokenize(vocab, tokenizer, features))
        if targets is not None:
            self.targets = np.array([1 if label == 'positive' else 0 for label in targets]).reshape(-1, 1)
        else:
            self.targets = None
    
    def __do_tokenize(self, vocab, tokenizer, features):
        # map each review to token ids, then pad every sequence to the longest length
        _features = [vocab(tokenizer(text)) for text in tqdm(features)]
        _max_len = max([len(token) for token in _features])
        return [token + vocab(['<pad>']) * (_max_len - len(token)) for token in tqdm(_features)]
    
    def __len__(self):
        return self.features.shape[0]
    
    def __getitem__(self, index:int):
        feature = torch.LongTensor(self.features[index])
        target = None
        if self.targets is not None:
            target = torch.Tensor(self.targets[index])
        return feature, target

dataset = ReviewDataset(
    vocab=vocab,
    tokenizer=tokenizer,
    features=df['review'].tolist(),
    targets=df['sentiment'].tolist()
)
len(dataset)
# 5000

feature, target = dataset[0]
feature.shape, target.shape
# (torch.Size([632]), torch.Size([1]))

 

DataLoader

from torch.utils.data import DataLoader

dataloader = DataLoader(dataset=dataset, batch_size=256, shuffle=True)
len(dataloader)
# 20

features, targets = next(iter(dataloader))
features.shape, targets.shape
# (torch.Size([256, 632]), torch.Size([256, 1]))

 

 

1-2. Korean Movie Reviews

 

Load Data

from google.colab import drive
drive.mount('/content/data')
import pandas as pd

data_path = ''
df = pd.read_csv(data_path + 'ratings_train.txt', sep='\t')
df.shape
# (150000, 3)

df = df[:5000]
df.shape
# (5000, 3)

 

Cleaning

df['document'] = df['document'].map(
    lambda x: x.replace('..', '').replace('ㅡㅡ', '').replace('ㅠ.ㅠ', '')\
    .replace('ㅋ', '').replace('ㅎ', '').replace('~', '')
)

 

Tokenization

!pip install kiwipiepy
  • Create the Tokenizer
from kiwipiepy import Kiwi

kiwi = Kiwi()
  • Stemming / Stopword
from kiwipiepy.utils import Stopwords

stopwords = Stopwords()

def tokenizer(text):
    tokens = kiwi.tokenize(text, stopwords=stopwords)
    
    return [t.form for t in tokens if t.tag[0] in 'NJMV']
  • Vocabulary
!pip install -U torchtext==0.15.2
from torchtext.vocab import build_vocab_from_iterator
from tqdm.auto import tqdm

def yield_tokens(data, tokenizer):
    for text in tqdm(data):
        yield tokenizer(text)

gen = yield_tokens(df['document'], tokenizer)

vocab = build_vocab_from_iterator(gen, specials=['<pad>', '<unk>'])
vocab.set_default_index(vocab['<unk>'])

len(vocab)
# 7124
  • Run Tokenization
features = [vocab(tokenizer(text)) for text in tqdm(df['document'])]
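
For reference (the sentence below is my own example, not from the dataset, and the exact morphemes depend on the Kiwi version), kiwi.tokenize returns Token objects whose form and tag fields the tokenizer filters, keeping tags starting with N, J, M, or V; forms not in the vocabulary fall back to <unk> via the default index.

# Illustrative check on a made-up sentence: morpheme forms, POS tags, and vocab ids
sample = "이 영화 정말 재미있었어요"
for t in kiwi.tokenize(sample, stopwords=stopwords):
    print(t.form, t.tag)
print(tokenizer(sample))         # forms whose tag starts with N, J, M, or V
print(vocab(tokenizer(sample)))  # ids; out-of-vocabulary forms map to <unk>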

 

Padding

max_len = max([len(token) for token in features])
max_len
# 41

features = [token + vocab(['<pad>']) * (max_len - len(token)) for token in tqdm(features)]

 

Dataset Class

import numpy as np
import torch
from torch.utils.data import Dataset

class ReviewDataset(Dataset):
    def __init__(self, vocab, tokenizer, features, targets=None) -> None:
        super().__init__()
        self.features = np.array(self.__do_tokenize(vocab, tokenizer, features))
        self.targets = np.array(targets).reshape(-1, 1) if targets is not None else None
    
    def __do_tokenize(self, vocab, tokenizer, features):
        _features = [vocab(tokenizer(text)) for text in tqdm(features)]
        _max_len = max([len(token) for token in _features])
        return [token + vocab(['<pad>']) * (_max_len - len(token)) for token in tqdm(_features)]
    
    def __len__(self):
        return self.features.shape[0]
    
    def __getitem__(self, index:int):
        feature = torch.LongTensor(self.features[index])
        target = None
        if self.targets is not None:
            target = torch.Tensor(self.targets[index])
        return feature, target

dataset = ReviewDataset(
    vocab=vocab,
    tokenizer=tokenizer,
    features=df['document'].tolist(),
    targets=df['label'].tolist()
)
len(dataset)
# 5000

feature, target = dataset[0]
feature.shape, target.shape
# (torch.Size([41]), torch.Size([1]))

feature[:5], target
# (tensor([524,  10, 117,  11, 677]), tensor([0.]))

 

DataLoader

from torch.utils.data import DataLoader

dataloader = DataLoader(dataset=dataset, batch_size=256, shuffle=True)
len(dataloader)
# 20

features, targets = next(iter(dataloader))
features.shape, targets.shape
# (torch.Size([256, 41]), torch.Size([256, 1]))

 

 

 

2. CNN 1D

 

 

2-1. CNN 1D Model

 

Load Data

from google.colab import drive
drive.mount('/content/data')
import pandas as pd

data_path = ''
df = pd.read_csv(data_path + 'ratings_train.txt', sep='\t')
df.shape
# (150000, 3)
  • Split Train & Test Dataset
from sklearn.model_selection import train_test_split

df_train, df_test = train_test_split(df, test_size=0.3, shuffle=True, stratify=df['label'])

 

Cleaning

df_train.dropna(inplace=True)
df_test.dropna(inplace=True)

df_train['document'] = df_train['document'].map(
    lambda x: x.replace('..', '').replace('ㅡㅡ', '')\
    .replace('ㅠ.ㅠ', '').replace('ㅋ', '').replace('ㅎ', '').replace('~', '').replace('^^', '')
)

df_test['document'] = df_test['document'].map(
    lambda x: x.replace('..', '').replace('ㅡㅡ', '')\
    .replace('ㅠ.ㅠ', '').replace('ㅋ', '').replace('ㅎ', '').replace('~', '').replace('^^', '')
)

 

Tokenization

!pip install kiwipiepy
  • Create the Tokenizer
from kiwipiepy import Kiwi

kiwi = Kiwi()
  • Stemming / Stopword
from kiwipiepy.utils import Stopwords

stopwords = Stopwords()

def tokenizer(text):
    tokens = kiwi.tokenize(text, stopwords=stopwords)
    
    return [t.form for t in tokens if t.tag[0] in 'NJMV']
  • Vocabulary
!pip install -U torchtext==0.15.2
from torchtext.vocab import build_vocab_from_iterator
from tqdm.auto import tqdm

def yield_tokens(data, tokenizer):
    for text in tqdm(data):
        yield tokenizer(text)

gen = yield_tokens(df_train['document'], tokenizer)

vocab = build_vocab_from_iterator(gen, specials=['<pad>', '<unk>'])
vocab.set_default_index(vocab['<unk>'])

len(vocab)
# 36777

 

Dataset Class

import numpy as np
import torch
from torch.utils.data import Dataset

class ReviewDataset(Dataset):
    def __init__(self, vocab, tokenizer, features, targets=None, max_len=None) -> None:
        super().__init__()
        self.max_len = max_len
        self.targets = np.array(targets).reshape(-1, 1) if targets is not None else None
        self.do_tokenize(vocab, tokenizer, features)
    
    def do_tokenize(self, vocab, tokenizer, features):
        _features = [vocab(tokenizer(text)) for text in tqdm(features)]
        
        if self.max_len is None:
            self.max_len = max([len(token) for token in _features])
        
        self.features = np.array([
            token + vocab(['<pad>']) * (self.max_len - len(token))
            if len(token) <= self.max_len else token[:self.max_len]
            for token in tqdm(_features)
        ])
    
    def __len__(self):
        return self.features.shape[0]
    
    def __getitem__(self, index:int):
        feature = torch.LongTensor(self.features[index])
        target = None
        if self.targets is not None:
            target = torch.Tensor(self.targets[index])
        return feature, target

dt_train = ReviewDataset(
    vocab=vocab,
    tokenizer=tokenizer,
    features=df_train['document'].tolist(),
    targets=df_train['label'].tolist()
)
dt_test = ReviewDataset(
    vocab=vocab,
    tokenizer=tokenizer,
    features=df_test['document'].tolist(),
    targets=df_test['label'].tolist()
)
len(dt_train), len(dt_test)
# (104997, 104998)

train_feature, _ = dt_train[0]
test_feature, _ = dt_test[0]
train_feature.shape, test_feature.shape
# (torch.Size([54]), torch.Size([54]))
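
The padded id sequences above are what a 1D CNN consumes. Since the model code itself doesn't appear up to this point, here is a minimal sketch of an Embedding + Conv1d sentiment classifier; the class name, layer sizes, and hyperparameters (SentimentCNN, embed_dim=128, n_filters=64, kernel_size=3) are my own assumptions for illustration, not the course's actual model.

import torch
import torch.nn as nn

class SentimentCNN(nn.Module):
    # Hypothetical sketch: embedding -> 1D convolution over token positions -> global max pool -> logit
    def __init__(self, vocab_size, embed_dim=128, n_filters=64, kernel_size=3, pad_idx=0) -> None:
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=pad_idx)
        self.conv = nn.Conv1d(embed_dim, n_filters, kernel_size, padding=1)
        self.relu = nn.ReLU()
        self.pool = nn.AdaptiveMaxPool1d(1)  # global max pool over the sequence axis
        self.fc = nn.Linear(n_filters, 1)    # single logit for binary sentiment
    
    def forward(self, x):
        x = self.embedding(x)         # (batch, seq_len, embed_dim)
        x = x.permute(0, 2, 1)        # Conv1d expects (batch, channels, seq_len)
        x = self.relu(self.conv(x))   # (batch, n_filters, seq_len)
        x = self.pool(x).squeeze(-1)  # (batch, n_filters)
        return self.fc(x)             # (batch, 1) logits, to pair with BCEWithLogitsLoss

model = SentimentCNN(vocab_size=len(vocab))
feature, _ = dt_train[0]
model(feature.unsqueeze(0)).shape
# torch.Size([1, 1])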