39일차. 자연어 데이터 준비 - 형태소 분석 & 어휘집 & Padding

39일 차 회고.

아직까지는 수업에 흥미가 있어서 나쁘지 않은 것 같다. 그런데 코딩 실력이나 설계하는 능력은 좀 떨어지는 것 같아서 이 부분에 대해서 더 노력을 해야 할 것 같다. 내일 SQLD 시험을 보는데 일단 여기에 집중을 하고 나머지는 주말에 생각해봐야 할 것 같다.

1. 형태소 분석

1-1. 영어 형태소 분석기

nltk

python에서 가장 오래되고 유명한 자연어 처리 라이브러리
한국어 미지원

import nltk

nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('averaged_perceptron_tagger')
nltk.download('averaged_perceptron_tagger_eng')

# Sample text for the tokenizer / POS-tagger demo below.
# The name is spelled "Arthur" so the input agrees with the recorded outputs.
sentence = """
At eight o'clock on Thursday morning
Arthur didn't feel very good.
"""

tokens = nltk.word_tokenize(sentence)
tokens[:5]
# ['At', 'eight', "o'clock", 'on', 'Thursday']

tagged = nltk.pos_tag(tokens)
tagged[:5]
# [('At', 'IN'),
#  ('eight', 'CD'),
#  ("o'clock", 'NN'),
#  ('on', 'IN'),
#  ('Thursday', 'NNP')]

# N*: 명사
# V*: 동사
lst = []
for token, pos in tagged:
    if pos.startswith('N') or pos.startswith('V'):
        lst.append(token)
lst
# ["o'clock", 'Thursday', 'morning', 'Arthur', 'did', 'feel']

lst = [
    token for token, pos in tagged if pos.startswith('N') or pos.startswith('V')
]
lst
# ["o'clock", 'Thursday', 'morning', 'Arthur', 'did', 'feel']

spacy

자연어 처리를 위한 python 기반의 오픈 소스 라이브러리
다국어 지원

import spacy
from spacy.lang.en.examples import sentences

sentences
# ['Apple is looking at buying U.K. startup for $1 billion',
#  'Autonomous cars shift insurance liability toward manufacturers',
#  'San Francisco considers banning sidewalk delivery robots',
#  'London is a big city in the United Kingdom.',
#  'Where are you?',
#  'Who is the president of France?',
#  'What is the capital of the United States?',
#  'When was Barack Obama born?']

nlp = spacy.load('en_core_web_sm')

sentences[0]
# 'Apple is looking at buying U.K. startup for $1 billion'

doc = nlp(sentences[0])
doc
# Apple is looking at buying U.K. startup for $1 billion

print(doc.text)
print('-' * 70)
print('단어', '원형', '품사', '태그', '의존성', '모양', '알파벳', '금칙어', sep='\t')
for token in doc:
    print(
        token.text,
        token.lemma_,
        token.pos_,
        token.tag_,
        token.dep_,
        token.shape_,
        token.is_alpha,
        token.is_stop,
        sep='\t'
    )
# Apple is looking at buying U.K. startup for $1 billion
# ----------------------------------------------------------------------
# 단어		원형	품사	태그	의존성		모양	알파벳	금칙어
# Apple		Apple	PROPN	NNP	nsubj		Xxxxx	True	False
# is		be	AUX	VBZ	aux		xx	True	True
# looking	look	VERB	VBG	ROOT		xxxx	True	False
# at		at	ADP	IN	prep		xx	True	True
# buying	buy	VERB	VBG	pcomp		xxxx	True	False
# U.K.		U.K.	PROPN	NNP	dobj		X.X.	False	False
# startup	startup	NOUN	NN	dep		xxxx	True	False
# for		for	ADP	IN	prep		xxx	True	True
# $		$	SYM	$	quantmod	$	False	False
# 1		1	NUM	CD	compound	d	False	False
# billion	billion	NUM	CD	pobj		xxxx	True	False

!python -m spacy download ko_core_news_sm
import spacy
from spacy.lang.ko.examples import sentences
import locale

def getpreferredencoding(do_setlocale=True):
    """Return the string 'UTF-8', ignoring ``do_setlocale``.

    The parameter is accepted only so the signature stays call-compatible
    with ``locale.getpreferredencoding``, which this function replaces.
    """
    forced_encoding = 'UTF-8'
    return forced_encoding

locale.getpreferredencoding = getpreferredencoding

sentences
# ['애플이 영국의 스타트업을 10억 달러에 인수하는 것을 알아보고 있다.',
#  '자율주행 자동차의 손해 배상 책임이 제조 업체로 옮겨 가다',
#  '샌프란시스코 시가 자동 배달 로봇의 보도 주행 금지를 검토 중이라고 합니다.',
#  '런던은 영국의 수도이자 가장 큰 도시입니다.']

nlp = spacy.load('ko_core_news_sm')

doc = nlp(sentences[0])

print(doc.text)
print('-' * 92)
print('단어', '원형', '품사', '태그', '의존성', '모양', '알파벳', '금칙어', sep='\t')
for token in doc:
    print(
        token.text,
        token.lemma_,
        token.pos_,
        token.tag_,
        token.dep_,
        token.shape_,
        token.is_alpha,
        token.is_stop,
        sep='\t'
    )
# 애플이 영국의 스타트업을 10억 달러에 인수하는 것을 알아보고 있다.
# --------------------------------------------------------------------------------------------
# 단어		원형		품사	태그		의존성		모양	알파벳	금칙어
# 애플이		애플+이		NOUN	ncn+jxt		dislocated	xxx	True	False
# 영국의		영국+의		PROPN	nq+jcm		nmod		xxx	True	False
# 스타트업을	스타트업+을	NOUN	ncn+jcs		nsubj		xxxx	True	False
# 10억		10+억		NUM	nnc+nnc		compound	ddx	False	False
# 달러에		달러+에		ADV	nbu+jca		obl		xxx	True	False
# 인수하는	인수+하+는	VERB	ncpa+xsv+etm	acl		xxxx	True	False
# 것을		것+을		NOUN	nbn+jco		obj		xx	True	False
# 알아보고	알아보+고	AUX	pvg+ecx+px+ecx	ROOT		xxxx	True	False
# 있다		있+다		AUX	px+ef		aux		xx	True	False
# .		.		PUNCT	sf		punct		.	False	False

doc = nlp('아버지가방에들어가신다.')

print(doc.text)
print('-' * 92)
print('단어', '원형', '품사', '태그', '의존성', '모양', '알파벳', '금칙어', sep='\t')
for token in doc:
    print(
        token.text,
        token.lemma_,
        token.pos_,
        token.tag_,
        token.dep_,
        token.shape_,
        token.is_alpha,
        token.is_stop,
        sep='\t'
    )
# 아버지가방에들어가신다.
# --------------------------------------------------------------------------------------------
# 단어			원형			품사	태그	의존성	모양	알파벳	금칙어
# 아버지가방에들어가신다	아버지가방에들어가신다	ADJ	paa+ef	ROOT	xxxx	True	False
# .			.			PUNCT	sf	punct	.	False	False

1-2. 한국어 전용 형태소 분석기

KoNLPy

!git clone https://github.com/SOMJANG/Mecab-ko-for-Google-Colab.git
!bash /content/Mecab-ko-for-Google-Colab/install_mecab-ko_on_colab_light_220429.sh

Okt(Twitter)

from konlpy.tag import Okt

okt = Okt()

sentences
# ['애플이 영국의 스타트업을 10억 달러에 인수하는 것을 알아보고 있다.',
#  '자율주행 자동차의 손해 배상 책임이 제조 업체로 옮겨 가다',
#  '샌프란시스코 시가 자동 배달 로봇의 보도 주행 금지를 검토 중이라고 합니다.',
#  '런던은 영국의 수도이자 가장 큰 도시입니다.']

okt.pos(sentences[0])
# [('애플', 'Noun'),
#  ('이', 'Josa'),
#  ('영국', 'Noun'),
#  ('의', 'Josa'),
#  ('스타트업', 'Noun')]

result = []
for sentence in sentences:
    result.append([
        token[0] for token in okt.pos(sentence) if token[1][0] in 'NVJ'
        # token for token, pos in okt.pos(sentence) if pos[0] in 'NVJ'
    ])
result[0][:5]
# ['애플', '이', '영국', '의', '스타트업']

okt.pos('아버지가방에들어가신다.')
# [('아버지', 'Noun'),
#  ('가방', 'Noun'),
#  ('에', 'Josa'),
#  ('들어가신다', 'Verb'),
#  ('.', 'Punctuation')]

Mecab

from konlpy.tag import Mecab

nlp = Mecab()

nlp.pos(sentences[0])[:5]
# [('애플', 'NNP'), ('이', 'JKS'), ('영국', 'NNP'), ('의', 'JKG'), ('스타트업', 'NNG')]

# Keep only morphemes whose tag starts with N, V or J.
# Tag with the Mecab instance (`nlp`), not Okt, so this section actually
# demonstrates Mecab's output.
result = []
for sentence in sentences:
    result.append([
        token for token, pos in nlp.pos(sentence) if pos[0] in 'NVJ'
    ])
result[0][:5]
# ['애플', '이', '영국', '의', '스타트업']

nlp.pos('아버지가방에들어가신다.')
# [('아버지', 'NNG'),
#  ('가', 'JKS'),
#  ('방', 'NNG'),
#  ('에', 'JKB'),
#  ('들어가', 'VV'),
#  ('신다', 'EP+EF'),
#  ('.', 'SF')]

Kiwi

형태소 분석

!pip install kiwipiepy
from kiwipiepy import Kiwi

kiwi = Kiwi()

result = kiwi.tokenize(sentences[0])[:5]
result
# [Token(form='애플', tag='NNP', start=0, len=2),
#  Token(form='이', tag='JKS', start=2, len=1),
#  Token(form='영국', tag='NNP', start=4, len=2),
#  Token(form='의', tag='JKG', start=6, len=1),
#  Token(form='스타트업', tag='NNG', start=8, len=4)]

result = []
for sentence in sentences:
    result.append([
        token.form for token in kiwi.tokenize(sentence) if token.tag[0] in 'NV'
    ])
result[0][:5]
# ['애플', '영국', '스타트업', '억', '달러']

kiwi.tokenize('아버지가방에들어가신다.')
# [Token(form='아버지', tag='NNG', start=0, len=3),
#  Token(form='가', tag='JKS', start=3, len=1),
#  Token(form='방', tag='NNG', start=4, len=1),
#  Token(form='에', tag='JKB', start=5, len=1),
#  Token(form='들어가', tag='VV', start=6, len=3),
#  Token(form='시', tag='EP', start=9, len=1),
#  Token(form='ᆫ다', tag='EF', start=9, len=2),
#  Token(form='.', tag='SF', start=11, len=1)]

불용어

from kiwipiepy.utils import Stopwords

# Instantiate the class: `.stopwords`, `.add()` and `.remove()` are used
# below as instance members, so binding the bare class would fail.
stopwords = Stopwords()
# Peek at a few of the built-in stop-word entries.
list(stopwords.stopwords)[:5]

result = kiwi.tokenize(sentences, stopwords=stopwords)
tokens = next(iter(result))
tokens
# [Token(form='애플', tag='NNP', start=0, len=2),
#  Token(form='영국', tag='NNP', start=4, len=2),
#  Token(form='스타트업', tag='NNG', start=8, len=4),
#  Token(form='10', tag='SN', start=14, len=2),
#  Token(form='억', tag='NR', start=16, len=1),
#  Token(form='달러', tag='NNB', start=18, len=2),
#  Token(form='인수', tag='NNG', start=22, len=2),
#  Token(form='알아보', tag='VV', start=30, len=3)]

# Register ('애플', 'NNP') as an additional stop word.
stopwords.add(('애플', 'NNP'))

# A membership test already evaluates to a bool; no conditional needed.
('애플', 'NNP') in stopwords.stopwords
# True

# `sentences` is the positional input and `stopwords` a keyword argument;
# they must be separated by a comma.
result = kiwi.tokenize(sentences, stopwords=stopwords)

tokens = next(iter(result))
tokens
# [Token(form='영국', tag='NNP', start=4, len=2),
#  Token(form='스타트업', tag='NNG', start=8, len=4),
#  Token(form='10', tag='SN', start=14, len=2),
#  Token(form='억', tag='NR', start=16, len=1),
#  Token(form='달러', tag='NNB', start=18, len=2),
#  Token(form='인수', tag='NNG', start=22, len=2),
#  Token(form='알아보', tag='VV', start=30, len=3)]

# Drop ('애플', 'NNP') from the stop-word collection again.
stopwords.remove(('애플', 'NNP'))

# A membership test already evaluates to a bool; no conditional needed.
('애플', 'NNP') in stopwords.stopwords
# False

# `sentences` is the positional input and `stopwords` a keyword argument;
# they must be separated by a comma.
result = kiwi.tokenize(sentences, stopwords=stopwords)

tokens = next(iter(result))
tokens
# [Token(form='애플', tag='NNP', start=0, len=2),
#  Token(form='영국', tag='NNP', start=4, len=2),
#  Token(form='스타트업', tag='NNG', start=8, len=4),
#  Token(form='10', tag='SN', start=14, len=2),
#  Token(form='억', tag='NR', start=16, len=1),
#  Token(form='달러', tag='NNB', start=18, len=2),
#  Token(form='인수', tag='NNG', start=22, len=2),
#  Token(form='알아보', tag='VV', start=30, len=3)]

1-3. Vocabulary(어휘집)

어휘집 기능

전체 토큰의 수
- Embedding Model 학습 시, 사용할 전체 토큰의 개수
토큰 to 인덱스
- Embedding Model에서는 토큰이 아니라 인덱스를 입력으로 사용한다.
인덱스 to 토큰
- Model이 예측한 출력값은 인덱스로 나타나기 때문에 토큰으로 변환한다.

Class

class Vocab:
    """Bidirectional token <-> index lookup built from a corpus.

    Index 0 is always ``'<pad>'`` and index 1 is always ``'<unk>'``. Any extra
    special tokens follow, and the tokens returned by ``tokenize`` come last.
    """

    def __init__(self, tokenize, sentences, special_tokens: "list | None" = None) -> None:
        """Build both lookup tables.

        Args:
            tokenize: Callable that receives ``sentences`` and returns the
                corpus tokens as an iterable.
            sentences: Corpus handed to ``tokenize`` unchanged.
            special_tokens: Optional extra special tokens placed right after
                the two defaults. Duplicates (including repeats of the
                defaults) are dropped and first-seen order is preserved.
        """
        self.special_tokens = ['<pad>', '<unk>']
        if special_tokens is not None:
            # dict.fromkeys de-duplicates while keeping a deterministic order,
            # so special-token indices are stable from run to run.
            self.special_tokens += [
                token
                for token in dict.fromkeys(special_tokens)
                if token not in self.special_tokens
            ]

        self.idx_to_token = self.special_tokens + list(tokenize(sentences))
        # Inverse mapping of idx_to_token.
        self.token_to_idx = {
            token: index for index, token in enumerate(self.idx_to_token)
        }

    def token_to_index(self, token):
        """Return the index of ``token``; raises KeyError if it is unknown."""
        return self.token_to_idx[token]

    def __getitem__(self, index):
        """Return the token stored at position ``index``."""
        return self.idx_to_token[index]

    def __len__(self):
        """Return the total number of entries, special tokens included."""
        return len(self.idx_to_token)

한국어

from spacy.lang.ko.examples import sentences

sentences
# ['애플이 영국의 스타트업을 10억 달러에 인수하는 것을 알아보고 있다.',
#  '자율주행 자동차의 손해 배상 책임이 제조 업체로 옮겨 가다',
#  '샌프란시스코 시가 자동 배달 로봇의 보도 주행 금지를 검토 중이라고 합니다.',
#  '런던은 영국의 수도이자 가장 큰 도시입니다.']

!pip install kiwipiepy
from kiwipiepy import Kiwi
from tqdm import tqdm

def kiwi_tokenize(sentences):
    """Return the distinct morpheme forms in ``sentences`` tagged N* or V*.

    A new ``Kiwi`` analyzer is created on every call. Each sentence is
    tokenized under a tqdm progress bar labelled 'tokenize', and only
    morphemes whose tag begins with 'N' or 'V' are kept. The result is
    de-duplicated through a set, so its order is arbitrary.
    """
    analyzer = Kiwi()

    forms = [
        morph.form
        for text in tqdm(sentences, desc='tokenize')
        for morph in analyzer.tokenize(text)
        if morph.tag[0] in 'NV'
    ]
    return list(set(forms))

vocab = Vocab(kiwi_tokenize, sentences)

len(vocab)
# 36

vocab[0], vocab[1], vocab[2]
# ('<pad>', '<unk>', '가')

vocab.token_to_index('<pad>')
# 0

영어

from spacy.lang.en.examples import sentences

sentences
# ['Apple is looking at buying U.K. startup for $1 billion',
#  'Autonomous cars shift insurance liability toward manufacturers',
#  'San Francisco considers banning sidewalk delivery robots',
#  'London is a big city in the United Kingdom.',
#  'Where are you?',
#  'Who is the president of France?',
#  'What is the capital of the United States?',
#  'When was Barack Obama born?']

def en_tokenize(sentences):
    """Return the distinct token texts in ``sentences`` that pass a POS filter.

    Loads the 'en_core_web_sm' pipeline on every call, runs each sentence
    through it under a tqdm progress bar labelled 'tokenize', and keeps
    tokens whose coarse POS tag begins with 'P', 'A' or 'V'. The result is
    de-duplicated through a set, so its order is arbitrary.
    """
    # NOTE(review): the first-letter filter also matches PUNCT and excludes
    # NOUN -- confirm that this is the intended selection.
    pipeline = spacy.load('en_core_web_sm')

    texts = [
        tok.text
        for line in tqdm(sentences, desc='tokenize')
        for tok in pipeline(line)
        if tok.pos_[0] in 'PAV'
    ]
    return list(set(texts))

vocab = Vocab(en_tokenize, sentences)

len(vocab)
# 34

vocab[0], vocab[1], vocab[2]
# ('<pad>', '<unk>', 'you')

vocab.token_to_index('<pad>')
# 0

torchtext

!pip install -U torchtext==0.15.2
from torchtext.vocab import build_vocab_from_iterator

def kiwi_tokenizer(sentence, kiwi):
    """Tokenize one sentence and return the forms tagged N* or V*, in order.

    Args:
        sentence: Text passed to ``kiwi.tokenize``.
        kiwi: Analyzer object exposing ``tokenize``; each token it yields
            must provide ``form`` and ``tag`` attributes.
    """
    kept_forms = []
    for morph in kiwi.tokenize(sentence):
        # Only the first character of the tag decides whether it is kept.
        if morph.tag[0] in 'NV':
            kept_forms.append(morph.form)
    return kept_forms

def yield_tokens(sentences:list, tokenizer):
    """Lazily yield ``tokenizer(sentence, kiwi)`` for every sentence.

    One ``Kiwi`` analyzer is created when iteration starts and is shared by
    all calls to ``tokenizer``. Progress is reported through a tqdm bar
    labelled 'tokenizer'.
    """
    analyzer = Kiwi()
    progress = tqdm(sentences, desc='tokenizer')
    yield from (tokenizer(text, analyzer) for text in progress)

from spacy.lang.ko.examples import sentences

sentences
# ['애플이 영국의 스타트업을 10억 달러에 인수하는 것을 알아보고 있다.',
#  '자율주행 자동차의 손해 배상 책임이 제조 업체로 옮겨 가다',
#  '샌프란시스코 시가 자동 배달 로봇의 보도 주행 금지를 검토 중이라고 합니다.',
#  '런던은 영국의 수도이자 가장 큰 도시입니다.']

token_gen = yield_tokens(sentences, kiwi_tokenizer)

# Build the vocabulary from the token stream; the two special tokens are
# listed first (indices 0 and 1 in the lookup output shown below).
vocab = build_vocab_from_iterator(
    token_gen,
    specials=['<pad>', '<unk>']
)
# Send out-of-vocabulary tokens to '<unk>' so they remain distinguishable
# from padding.
vocab.set_default_index(vocab['<unk>'])

len(vocab)
# 36

vocab.lookup_tokens([0, 1, 2, 3])
# ['<pad>', '<unk>', '이', '영국']

# Calling the vocab with a list of tokens returns their integer indices.
vocab(['<pad>', '<unk>', '이', '영국'])
# [0, 1, 2, 3]

1-4. Padding

from spacy.lang.ko.examples import sentences

sentences
# ['애플이 영국의 스타트업을 10억 달러에 인수하는 것을 알아보고 있다.',
#  '자율주행 자동차의 손해 배상 책임이 제조 업체로 옮겨 가다',
#  '샌프란시스코 시가 자동 배달 로봇의 보도 주행 금지를 검토 중이라고 합니다.',
#  '런던은 영국의 수도이자 가장 큰 도시입니다.']

kiwi = Kiwi()

tokenized_indices = [
    vocab(kiwi_tokenizer(sentence, kiwi)) for sentence in sentences
]
tokenized_indices
# [[22, 3, 19, 23, 9, 26, 7, 21, 27],
#  [30, 4, 29, 17, 14, 33, 31, 24, 25, 5],
#  [16, 20, 28, 13, 12, 15, 4, 8, 6, 32, 2, 35],
#  [11, 3, 18, 2, 34, 10, 2]]

sentence_max_len = max([len(tokenized) for tokenized in tokenized_indices])
sentence_max_len
# 12

vocab(['<pad>'])
# [0]

# Look up the padding index once instead of once per sentence.
pad_ids = vocab(['<pad>'])
# Right-pad every index list so that all of them have length
# `sentence_max_len`.
tokenized_padding = [
    tokenized + pad_ids * (sentence_max_len - len(tokenized))
    for tokenized in tokenized_indices
]

tokenized_padding
# [[22, 3, 19, 23, 9, 26, 7, 21, 27, 0, 0, 0],
#  [30, 4, 29, 17, 14, 33, 31, 24, 25, 5, 0, 0],
#  [16, 20, 28, 13, 12, 15, 4, 8, 6, 32, 2, 35],
#  [11, 3, 18, 2, 34, 10, 2, 0, 0, 0, 0, 0]]

'SK네트웍스 Family AI캠프 10기 > Daily 회고' 카테고리의 다른 글

41일차. 자연어 딥러닝 - CNN 1D Model (0)	2025.03.11
40일차. 자연어 데이터 준비 - Text Preprocessing & 1D CNN (0)	2025.03.10
38일차. 자연어 데이터 준비 - NLP & Integer Encoding & Word2Vec (0)	2025.03.06
36-37일차. 단위 프로젝트(데이터 분석과 머신러닝, 딥러닝) (0)	2025.03.05
35일차. Deep Learning - 추천 시스템 (0)	2025.02.28

이네의 개발 노트

39일차. 자연어 데이터 준비 - 형태소 분석 & 어휘집 & Padding

1. 형태소 분석

1-1. 영어 형태소 분석기

1-2. 한국어 전용 형태소 분석기

1-3. Vocabulary(어휘집)

1-4. Padding

'SK네트웍스 Family AI캠프 10기 > Daily 회고' 카테고리의 다른 글

티스토리툴바

39일차. 자연어 데이터 준비 - 형태소 분석 & 어휘집 & Padding

1. 형태소 분석

1-1. 영어 형태소 분석기

1-2. 한국어 전용 형태소 분석기

1-3. Vocabulary(어휘집)

1-4. Padding

'SK네트웍스 Family AI캠프 10기 > Daily 회고' 카테고리의 다른 글

'SK네트웍스 Family AI캠프 10기/Daily 회고' Related Articles

티스토리툴바