ai.stanford.edu/~amaas/data/sentiment/
์คํ ํผ๋ ๋ํ๊ต ์ธ๊ณต์ง๋ฅ ์ฐ๊ตฌํ์ ์๋ณธ ์๋ฃ๋ก classification ์งํ
import glob
import os
from random import shuffle

import numpy as np  # used later when reshaping the padded data into a 3-D tensor
Data load
def preprocess_data(filepath):
    # The aclImdb training data keeps positive and negative reviews in separate folders.
    positive_path = os.path.join(filepath, 'pos')
    negative_path = os.path.join(filepath, 'neg')
    pos_label = 1
    neg_label = 0
    dataset = []
    # Read every review file and store it as a (label, text) tuple.
    for filename in glob.glob(os.path.join(positive_path, '*.txt')):
        with open(filename, 'r') as f:
            dataset.append((pos_label, f.read()))
    for filename in glob.glob(os.path.join(negative_path, '*.txt')):
        with open(filename, 'r') as f:
            dataset.append((neg_label, f.read()))
    # Shuffle so positive and negative samples are interleaved.
    shuffle(dataset)
    return dataset
dataset = preprocess_data('<your local path>/aclImdb/train')  # path to the unzipped aclImdb training data
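A quick sanity check on what preprocess_data returns (this inspection snippet is an addition, and the exact values depend on the shuffle):

print(len(dataset))          # 25000 labeled training reviews (12500 pos + 12500 neg)
print(dataset[0][0])         # label: 1 for positive, 0 for negative
print(dataset[0][1][:100])   # first 100 characters of the review text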
Word vectorization
from nltk.tokenize import TreebankWordTokenizer
from gensim.models.keyedvectors import KeyedVectors
from nlpia.loaders import get_data
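The snippet above imports the loaders but never actually loads the word vectors that tokenize_and_vectorize relies on. A minimal sketch of that step, assuming the pretrained GoogleNews word2vec vectors; the 'w2v' key and the 200,000-word limit are assumptions:

# Assumption: nlpia's get_data fetches the pretrained GoogleNews vectors under the 'w2v' key.
word_vectors = get_data('w2v', limit=200000)  # limit caps the vocabulary to keep memory usage manageable

# Equivalent direct load with gensim, if the .bin.gz file is available locally:
# word_vectors = KeyedVectors.load_word2vec_format(
#     'GoogleNews-vectors-negative300.bin.gz', binary=True, limit=200000)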
nltk์ TreebankwordTokenizer๋ฅผ ํตํด ํ ํฐํ ์ฌ์ฉ dataset์ ๋ฌธ์ฅ์ ํ ํฐํ ์งํ
def tokenize_and_vectorize(dataset):
    tokenizer = TreebankWordTokenizer()
    vectorized_data = []
    for sample in dataset:
        tokens = tokenizer.tokenize(sample[1])  # sample is a (label, text) tuple
        sample_vecs = []
        for token in tokens:
            try:
                sample_vecs.append(word_vectors[token])
            except KeyError:
                pass  # token not in the pretrained vocabulary -> drop it
        vectorized_data.append(sample_vecs)
    return vectorized_data
vectorized_data = tokenize_and_vectorize(dataset)
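Each review is now a list of 300-dimensional word vectors, with out-of-vocabulary tokens silently dropped. A quick inspection (not in the original post):

print(len(vectorized_data))         # number of reviews
print(len(vectorized_data[0]))      # number of in-vocabulary tokens in the first review
print(len(vectorized_data[0][0]))   # 300: dimensionality of the GoogleNews embeddings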
collect_expected extracts the sentiment label (the expected target value) for each sample in the dataset.
def collect_expected(dataset):
    expected = []
    for sample in dataset:
        expected.append(sample[0])
    return expected
expected = collect_expected(dataset)
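A one-line consistency check (added here, not in the original) to confirm that vectors and labels stay aligned:

assert len(vectorized_data) == len(expected)  # one label per vectorized review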
Split into training and test sets at an 80 : 20 ratio.
split_point = int(len(vectorized_data) * .8)
x_train = vectorized_data[:split_point]
y_train = expected[:split_point]
x_test = vectorized_data[split_point:]
y_test = expected[split_point:]
Hyperparameters for the CNN:
- maxlen: cap each review at 400 tokens
- embedding_dims: length of the token vectors fed into the convolutional network
- filters: number of filters to train
- kernel_size: width of each 1-D filter
- hidden_dims: number of feed-forward neurons at the end of the network
- batch_size / epochs: mini-batch size and number of training passes
maxlen = 400
batch_size = 32
embedding_dims = 300
filters = 250
kernel_size = 3
hidden_dims = 250
epochs = 2
pad_trunc pads or truncates each sample to maxlen (400) tokens; it plays the same role as Keras's pad_sequences.
def pad_trunc(data, maxlen):
    """Pad each sample with zero vectors or truncate it to exactly maxlen tokens."""
    new_data = []
    # A zero vector with the same dimensionality as the word vectors.
    zero_vector = [0.0] * len(data[0][0])
    for sample in data:
        if len(sample) > maxlen:
            temp = sample[:maxlen]  # truncate long samples
        elif len(sample) < maxlen:
            additional_elems = maxlen - len(sample)
            temp = sample + [zero_vector] * additional_elems  # pad short samples
        else:
            temp = sample
        new_data.append(temp)
    return new_data
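A tiny illustration of pad_trunc's behavior on toy 2-dimensional vectors (hypothetical data, just to show the padding and truncation):

toy = [
    [[1.0, 1.0]],                          # too short: will be padded with zero vectors
    [[1.0, 1.0], [2.0, 2.0], [3.0, 3.0]],  # too long: will be truncated
]
print(pad_trunc(toy, maxlen=2))
# [[[1.0, 1.0], [0.0, 0.0]],
#  [[1.0, 1.0], [2.0, 2.0]]]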
x_train = pad_trunc(x_train,maxlen)
x_test = pad_trunc(x_test,maxlen)
x_train = np.reshape(x_train, (len(x_train), maxlen, embedding_dims))  # (samples, 400, 300) tensor
y_train = np.array(y_train)
x_test = np.reshape(x_test, (len(x_test), maxlen, embedding_dims))
y_test = np.array(y_test)
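A shape check on the resulting tensors, assuming the full 25,000-review aclImdb training folder and the 80 : 20 split above:

print(x_train.shape)  # (20000, 400, 300)
print(x_test.shape)   # (5000, 400, 300)
print(y_train.shape)  # (20000,)
print(y_test.shape)   # (5000,)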
๋ชจ๋ธ ๋น๋
- Create the network with Keras's Sequential model, stacking layers in order.
- Conv1D slides kernel_size-wide 1-D filters across the token sequence with a stride of 1; padding='valid' adds no padding, so the output is slightly shorter than the input (maxlen - kernel_size + 1 positions).
- ReLU is used as the activation function.
from keras.models import Sequential
from keras.layers import Conv1D, GlobalMaxPool1D

model = Sequential()
model.add(Conv1D(
    filters,
    kernel_size,
    padding='valid',
    activation='relu',
    strides=1,
    input_shape=(maxlen, embedding_dims)))
Max pooling extracts features by taking the maximum value: GlobalMaxPool1D keeps, for each filter, the single largest value across the entire sequence.
model.add(GlobalMaxPool1D())
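model.summary() makes the effect of padding='valid' and the global max pooling visible (the shapes below follow from maxlen=400, kernel_size=3 and filters=250):

model.summary()
# Conv1D with padding='valid': output length = maxlen - kernel_size + 1 = 400 - 3 + 1 = 398
#   -> output shape (None, 398, 250)
# GlobalMaxPool1D keeps the maximum over the 398 positions for each of the 250 filters
#   -> output shape (None, 250)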
'๐ฃ๏ธ Natural Language Processing' ์นดํ ๊ณ ๋ฆฌ์ ๋ค๋ฅธ ๊ธ
[BERT Dict] NSP(Next Senctenct Prediction) Task (0) | 2022.05.07 |
---|---|
๋ฒํคํ (bucketing)์ ์ด์ฉํ ํ์ต ๋ณต์ก๋ ํด๊ฒฐ (0) | 2021.03.28 |
[doc2vec] ๋ฌธ์ ์ ์ฌ๋ ์ถ์ (0) | 2021.03.09 |
[Word2vec] ๋จ์ด ๊ด๊ณ ์๊ฐํ (0) | 2021.03.08 |
Word2vec Vs GloVe (0) | 2021.03.08 |