Day 40 Retrospective.
I kept getting an error and spent ages trying to figure out what was wrong, and it turned out to be a typo. Errors caused by typos are really hard to track down, so from now on I should check my spelling carefully as well. I also need to prepare for the 빅데이터분석기사 (Big Data Analysis Engineer) exam and coding tests, but it's open-recruitment season and I'm swamped writing applications. This time I should stop putting it off and get the applications written quickly.
1. Text Preprocessing Exercises
1-1. English Movie Reviews
Load Data
from google.colab import drive
drive.mount('/content/data')
import pandas as pd
data_path = ''
df = pd.read_csv(data_path + '/IMDB Dataset.csv')
df.shape
# (50000, 2)
df = df[:5000]
df.shape
# (5000, 2)
Cleaning
df['review'] = df['review'].map(lambda x: x.replace('<br />', ''))
Tokenizer
- Create Tokenizer
import nltk
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('averaged_perceptron_tagger')
nltk.download('averaged_perceptron_tagger_eng')
- Stemming / Stopword
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk import pos_tag
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))
def tokenizer(text):
    text = text.lower()
    tokens = nltk.word_tokenize(text)
    tokens = [token for token in tokens if token not in stop_words]
    # keep only nouns (tags starting with N) and verbs (tags starting with V)
    tagged_tokens = pos_tag(tokens)
    tagged_tokens = [token for token, tag in tagged_tokens if tag[0] in 'NV']
    return tagged_tokens
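As a quick sanity check, the tokenizer can be run on one sentence (the sample text and output here are only illustrative; the exact tokens kept can vary with the POS tagger version):
sample = "The movie was not great but I loved the acting"
tokenizer(sample)
# e.g. ['movie', 'loved', 'acting'] -- only noun/verb tokens survive stopword removal and the POS filter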
- Vocabulary
!pip install -U torchtext==0.15.2
from torchtext.vocab import build_vocab_from_iterator
from tqdm.auto import tqdm
def yield_tokens(data, tokenizer):
    for text in tqdm(data):
        yield tokenizer(text)
gen = yield_tokens(df['review'], tokenizer)
vocab = build_vocab_from_iterator(gen, specials=['<pad>', '<unk>'])
vocab.set_default_index(vocab['<unk>'])
len(vocab)
# 38565
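Because '<pad>' and '<unk>' were passed as specials, they occupy the first indices (assuming build_vocab_from_iterator's default special_first=True), and set_default_index makes unseen tokens map to '<unk>'. A minimal check (the out-of-vocabulary word is just an illustration):
vocab['<pad>'], vocab['<unk>']
# (0, 1)
vocab(['wordnotinvocab'])
# [1] -- unseen tokens fall back to the <unk> index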
- Run Tokenization
features = [vocab(tokenizer(text)) for text in tqdm(df['review'])]
Padding
max_len = max([len(token) for token in features])
max_len
# 632
features = [token + vocab(['<pad>']) * (max_len - len(token)) for token in tqdm(features)]
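The list-multiplication trick works because vocab(['<pad>']) returns a one-element list containing the pad index (0 here), so repeating it fills every review out to max_len. A quick check:
len(features[0]) == max_len
# True -- every sequence now has the same length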
Dataset Class
import numpy as np
import torch
from torch.utils.data import Dataset
class ReviewDataset(Dataset):
    def __init__(self, vocab, tokenizer, features, targets=None) -> None:
        super().__init__()
        self.features = np.array(self.__do_tokenize(vocab, tokenizer, features))
        if targets is not None:
            # map the string labels to integers: 'positive' -> 1, 'negative' -> 0
            self.targets = np.array([1 if label == 'positive' else 0 for label in targets]).reshape(-1, 1)
        else:
            self.targets = None

    def __do_tokenize(self, vocab, tokenizer, features):
        _features = [vocab(tokenizer(text)) for text in tqdm(features)]
        _max_len = max([len(token) for token in _features])
        # pad every sequence to the longest one with the <pad> index
        return [token + vocab(['<pad>']) * (_max_len - len(token)) for token in tqdm(_features)]

    def __len__(self):
        return self.features.shape[0]

    def __getitem__(self, index: int):
        feature = torch.LongTensor(self.features[index])
        target = torch.Tensor(self.targets[index]) if self.targets is not None else None
        return feature, target
dataset = ReviewDataset(
    vocab=vocab,
    tokenizer=tokenizer,
    features=df['review'].tolist(),
    targets=df['sentiment'].tolist()
)
len(dataset)
# 5000
feature, target = dataset[0]
feature.shape, target.shape
# (torch.Size([632]), torch.Size([1]))
DataLoader
from torch.utils.data import DataLoader
dataloader = DataLoader(dataset=dataset, batch_size=256, shuffle=True)
len(dataloader)
# 20
features, targets = next(iter(dataloader))
features.shape, targets.shape
# (torch.Size([256, 632]), torch.Size([256, 1]))
1-2. Korean Movie Reviews
Load Data
from google.colab import drive
drive.mount('/content/data')
import pandas as pd
data_path = ''
df = pd.read_csv(data_path + 'ratings_train.txt', sep='\t')
df.shape
# (150000, 3)
df = df[:5000]
df.shape
# (5000, 3)
Cleaning
df['document'] = df['document'].map(
    lambda x: x.replace('..', '').replace('ㅡㅡ', '').replace('ㅠ.ㅠ', '')
        .replace('ㅋ', '').replace('ㅎ', '').replace('~', '')
)
Tokenization
!pip install kiwipiepy
- Create Tokenizer
from kiwipiepy import Kiwi
kiwi = Kiwi()
- Stemming / Stopword
from kiwipiepy.utils import Stopwords
stopwords = Stopwords()
def tokenizer(text):
    tokens = kiwi.tokenize(text, stopwords=stopwords)
    # keep nouns (N*), particles (J*), adverbs/determiners (M*), and verbs/adjectives (V*)
    return [t.form for t in tokens if t.tag[0] in 'NJMV']
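As with the English tokenizer, a quick check on one sentence (sample input and output are illustrative only; Kiwi's analysis and its default stopword list can change the result):
tokenizer("이 영화 정말 재미있었다")
# e.g. ['영화', '정말', '재미있'] -- surviving noun/adverb/adjective stems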
- Vocabulary
!pip install -U torchtext==0.15.2
from torchtext.vocab import build_vocab_from_iterator
from tqdm.auto import tqdm
def yield_tokens(data, tokenizer):
    for text in tqdm(data):
        yield tokenizer(text)
gen = yield_tokens(df['document'], tokenizer)
vocab = build_vocab_from_iterator(gen, specials=['<pad>', '<unk>'])
vocab.set_default_index(vocab['<unk>'])
len(vocab)
# 7124
- Run Tokenization
features = [vocab(tokenizer(text)) for text in tqdm(df['document'])]
Padding
max_len = max([len(token) for token in features])
max_len
# 41
features = [token + vocab(['<pad>']) * (max_len - len(token)) for token in tqdm(features)]
Dataset Class
import numpy as np
import torch
from torch.utils.data import Dataset
class ReviewDataset(Dataset):
    def __init__(self, vocab, tokenizer, features, targets=None) -> None:
        super().__init__()
        self.features = np.array(self.__do_tokenize(vocab, tokenizer, features))
        self.targets = np.array(targets).reshape(-1, 1) if targets is not None else None

    def __do_tokenize(self, vocab, tokenizer, features):
        _features = [vocab(tokenizer(text)) for text in tqdm(features)]
        _max_len = max([len(token) for token in _features])
        return [token + vocab(['<pad>']) * (_max_len - len(token)) for token in tqdm(_features)]

    def __len__(self):
        return self.features.shape[0]

    def __getitem__(self, index: int):
        feature = torch.LongTensor(self.features[index])
        target = None
        if self.targets is not None:
            target = torch.Tensor(self.targets[index])
        return feature, target
dataset = ReviewDataset(
    vocab=vocab,
    tokenizer=tokenizer,
    features=df['document'].tolist(),
    targets=df['label'].tolist()
)
len(dataset)
# 5000
feature, target = dataset[0]
feature.shape, target.shape
# (torch.Size([41]), torch.Size([1]))
feature[:5], target
# (tensor([524, 10, 117, 11, 677]), tensor([0.]))
DataLoader
from torch.utils.data import DataLoader
dataloader = DataLoader(dataset=dataset, batch_size=256, shuffle=True)
len(dataloader)
# 20
features, targets = next(iter(dataloader))
features.shape, targets.shape
# (torch.Size([256, 41]), torch.Size([256, 1]))
2. CNN 1D
2-1. CNN 1D Model
Load Data
from google.colab import drive
drive.mount('/content/data')
import pandas as pd
data_path = ''
df = pd.read_csv(data_path + 'ratings_train.txt', sep='\t')
df.shape
# (150000, 3)
- Split Train & Test Dataset
from sklearn.model_selection import train_test_split
df_train, df_test = train_test_split(df, test_size=0.3, shuffle=True, stratify=df['label'])
Cleaning
df_train.dropna(inplace=True)
df_test.dropna(inplace=True)
df_train['document'] = df_train['document'].map(
    lambda x: x.replace('..', '').replace('ㅡㅡ', '').replace('ㅠ.ㅠ', '')
        .replace('ㅋ', '').replace('ㅎ', '').replace('~', '').replace('^^', '')
)
df_test['document'] = df_test['document'].map(
    lambda x: x.replace('..', '').replace('ㅡㅡ', '').replace('ㅠ.ㅠ', '')
        .replace('ㅋ', '').replace('ㅎ', '').replace('~', '').replace('^^', '')
)
Tokenization
!pip install kiwipiepy
- Create Tokenizer
from kiwipiepy import Kiwi
kiwi = Kiwi()
- Stemming / Stopword
from kiwipiepy.utils import Stopwords
stopwords = Stopwords()
def tokenizer(text):
    tokens = kiwi.tokenize(text, stopwords=stopwords)
    return [t.form for t in tokens if t.tag[0] in 'NJMV']
- Vocabulary
!pip install -U torchtext==0.15.2
from torchtext.vocab import build_vocab_from_iterator
from tqdm.auto import tqdm
def yield_tokens(data, tokenizer):
    for text in tqdm(data):
        yield tokenizer(text)
gen = yield_tokens(df_train['document'], tokenizer)
vocab = build_vocab_from_iterator(gen, specials=['<pad>', '<unk>'])
vocab.set_default_index(vocab['<unk>'])
len(vocab)
# 36777
Dataset Class
import numpy as np
import torch
from torch.utils.data import Dataset
class ReviewDataset(Dataset):
    def __init__(self, vocab, tokenizer, features, targets=None, max_len=None) -> None:
        super().__init__()
        self.max_len = max_len
        self.targets = np.array(targets).reshape(-1, 1) if targets is not None else None
        self.do_tokenize(vocab, tokenizer, features)

    def do_tokenize(self, vocab, tokenizer, features):
        _features = [vocab(tokenizer(text)) for text in tqdm(features)]
        if self.max_len is None:
            self.max_len = max([len(token) for token in _features])
        # pad shorter sequences with <pad>, truncate longer ones to max_len
        self.features = np.array([
            token + vocab(['<pad>']) * (self.max_len - len(token))
            if len(token) <= self.max_len else token[:self.max_len]
            for token in tqdm(_features)
        ])

    def __len__(self):
        return self.features.shape[0]

    def __getitem__(self, index: int):
        feature = torch.LongTensor(self.features[index])
        target = None
        if self.targets is not None:
            target = torch.Tensor(self.targets[index])
        return feature, target
dt_train = ReviewDataset(
    vocab=vocab,
    tokenizer=tokenizer,
    features=df_train['document'].tolist(),
    targets=df_train['label'].tolist()
)
dt_test = ReviewDataset(
    vocab=vocab,
    tokenizer=tokenizer,
    features=df_test['document'].tolist(),
    targets=df_test['label'].tolist()
)
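Note that dt_test above is built without an explicit max_len, so it pads to the longest review in the test set. To guarantee that the test features have exactly the same length as the training features, the train set's max_len could be passed in instead (a small variation, not what was run below):
dt_test = ReviewDataset(
    vocab=vocab,
    tokenizer=tokenizer,
    features=df_test['document'].tolist(),
    targets=df_test['label'].tolist(),
    max_len=dt_train.max_len
)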
len(dt_train), len(dt_test)
# (104997, 104998)
train_feature, _ = dt_train[0]
test_feature, _ = dt_test[0]
train_feature.shape, test_feature.shape
# (torch.Size([54]), torch.Size([54]))
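These padded index sequences are what the CNN 1D model (covered in the next post) consumes. As a rough sketch only, a minimal Conv1d-based sentiment classifier over them might look like the following; the layer sizes and the padding_idx=0 choice are assumptions, not taken from the original post:
import torch.nn as nn

class CNN1DModel(nn.Module):
    def __init__(self, vocab_size, embed_dim=128, num_filters=64, kernel_size=5):
        super().__init__()
        # padding_idx=0 assumes <pad> is index 0 in the vocab
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=0)
        self.conv = nn.Conv1d(embed_dim, num_filters, kernel_size=kernel_size)
        self.relu = nn.ReLU()
        self.pool = nn.AdaptiveMaxPool1d(1)
        self.fc = nn.Linear(num_filters, 1)

    def forward(self, x):
        x = self.embedding(x)   # (batch, seq_len) -> (batch, seq_len, embed_dim)
        x = x.permute(0, 2, 1)  # Conv1d expects (batch, channels, seq_len)
        x = self.pool(self.relu(self.conv(x))).squeeze(-1)  # (batch, num_filters)
        return self.fc(x)       # (batch, 1) logits

model = CNN1DModel(vocab_size=len(vocab))
model(train_feature.unsqueeze(0)).shape
# torch.Size([1, 1])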