ai.stanford.edu/~amaas/data/sentiment/
Sentiment Analysis
Andrew L. Maas, Raymond E. Daly, Peter T. Pham, Dan Huang, Andrew Y. Ng, and Christopher Potts. (2011). Learning Word Vectors for Sentiment Analysis. The 49th Annual Meeting of the Association for Computational Linguistics (ACL 2011).
Classification is carried out on the original data released by the Stanford University AI research group.
import glob
import os
from random import shuffle
Data load
def preprocess_data(filepath):
    # Positive and negative reviews live in separate sub-directories.
    positive_path = os.path.join(filepath, 'pos')
    negative_path = os.path.join(filepath, 'neg')
    pos_label = 1
    neg_label = 0
    dataset = []
    for filename in glob.glob(os.path.join(positive_path, '*.txt')):
        with open(filename, 'r') as f:
            dataset.append((pos_label, f.read()))
    for filename in glob.glob(os.path.join(negative_path, '*.txt')):
        with open(filename, 'r') as f:
            dataset.append((neg_label, f.read()))
    # Shuffle so positive and negative samples are interleaved.
    shuffle(dataset)
    return dataset
dataset = preprocess_data('path/to/aclImdb/train')  # path to your local copy of the dataset
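A quick sanity check can confirm the loader worked; the expected counts below assume the standard aclImdb training split (12,500 reviews per class):

print(len(dataset))  # 25000 (label, review_text) tuples
print(dataset[0][0])  # 1 for a positive review, 0 for a negative one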
Word vectorization
from nltk.tokenize import TreebankWordTokenizer
from gensim.models.keyedvectors import KeyedVectors
from nlpia.loaders import get_data
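The vectorization step below looks tokens up in a word_vectors model that the snippet never loads. A minimal sketch, assuming the pretrained Google News word2vec vectors that nlpia's get_data can fetch (the exact loader call is an assumption, not shown in the original):

# Assumes the Google News word2vec vectors (300 dimensions) packaged by nlpia;
# limit keeps only the 200k most frequent words to save memory.
word_vectors = get_data('w2v', limit=200000)
# Equivalent with gensim, if the raw binary file is available locally:
# word_vectors = KeyedVectors.load_word2vec_format(
#     'GoogleNews-vectors-negative300.bin.gz', binary=True, limit=200000)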
Tokenize the sentences in the dataset with nltk's TreebankWordTokenizer, then replace each token with its word vector.
def tokenize_and_vectorize(dataset):
    tokenizer = TreebankWordTokenizer()
    vectorized_data = []
    for sample in dataset:
        tokens = tokenizer.tokenize(sample[1])
        sample_vecs = []
        for token in tokens:
            try:
                sample_vecs.append(word_vectors[token])
            except KeyError:
                # Token not in the word2vec vocabulary; skip it.
                pass
        vectorized_data.append(sample_vecs)
    return vectorized_data
vectorized_data = tokenize_and_vectorize(dataset)
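Each review now becomes a list of word vectors, and out-of-vocabulary tokens are silently dropped, so the per-review lengths vary. A small hedged check of the shapes:

print(len(vectorized_data))        # number of reviews
print(len(vectorized_data[0]))     # in-vocabulary tokens in the first review
print(len(vectorized_data[0][0]))  # 300, the word2vec dimensionality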
Extract the sentiment label from each sample with collect_expected.
def collect_expected(dataset):
    # The label (0 or 1) is the first element of each (label, text) tuple.
    expected = []
    for sample in dataset:
        expected.append(sample[0])
    return expected
expected = collect_expected(dataset)
Train/test split at an 8:2 ratio.
split_point = int(len(vectorized_data) * .8)
x_train = vectorized_data[:split_point]
y_train = expected[:split_point]
x_test = vectorized_data[split_point:]
y_test = expected[split_point:]
Hyperparameters:
- maxlen: cap each review at 400 tokens
- embedding_dims: length of the token vectors fed into the convolutional network
- filters: number of filters to train
- kernel_size: width of each 1-D filter
- hidden_dims: number of feed-forward neurons at the top of the network
maxlen = 400
batch_size = 32
embedding_dims = 300
filters = 250
kernel_size = 3
hidden_dims = 250
epochs = 2
pad_trunc trims or pads every sample to maxlen (400) tokens; it plays the same role as Keras's pad_sequences (a sketch of the Keras equivalent follows the reshaping step below).
def pad_trunc(data, maxlen):
    new_data = []
    # A zero vector with the same dimensionality as the word vectors.
    zero_vector = []
    for _ in range(len(data[0][0])):
        zero_vector.append(0.0)
    for sample in data:
        if len(sample) > maxlen:
            # Truncate long reviews.
            temp = sample[:maxlen]
        elif len(sample) < maxlen:
            # Pad short reviews with zero vectors up to maxlen.
            temp = sample
            additional_elems = maxlen - len(sample)
            for _ in range(additional_elems):
                temp.append(zero_vector)
        else:
            temp = sample
        new_data.append(temp)
    return new_data
import numpy as np

x_train = pad_trunc(x_train, maxlen)
x_test = pad_trunc(x_test, maxlen)

# Stack the nested lists into (samples, maxlen, embedding_dims) tensors.
x_train = np.reshape(x_train, (len(x_train), maxlen, embedding_dims))
y_train = np.array(y_train)
x_test = np.reshape(x_test, (len(x_test), maxlen, embedding_dims))
y_test = np.array(y_test)
Model build
- Create a Sequential model and add the layers to it.
- Conv1D slides kernel_size-wide filters over the token sequence with a stride of 1; padding='valid' adds no padding, so the output is slightly shorter than the input.
- ReLU activation on the convolution output.
from keras.models import Sequential
from keras.layers import Conv1D, GlobalMaxPool1D

model = Sequential()
model.add(Conv1D(
    filters,
    kernel_size,
    padding='valid',
    activation='relu',
    strides=1,
    input_shape=(maxlen, embedding_dims)))
Max pooling: extract features by taking the maximum value over each filter's output.
model.add(GlobalMaxPool1D())
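The snippet stops after the pooling layer, but the unused hyperparameters above (hidden_dims, batch_size, epochs) imply the rest of the classifier. A minimal sketch, assuming a single dense hidden layer, a sigmoid output, and binary cross-entropy; the dropout rate and optimizer are assumptions, not taken from the original:

from keras.layers import Dense, Dropout, Activation

# Feed-forward layer on top of the pooled feature vector.
model.add(Dense(hidden_dims))
model.add(Dropout(0.2))
model.add(Activation('relu'))

# Single sigmoid unit for the binary positive/negative decision.
model.add(Dense(1))
model.add(Activation('sigmoid'))

model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

model.fit(x_train, y_train,
          batch_size=batch_size,
          epochs=epochs,
          validation_data=(x_test, y_test))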