728x90

ai.stanford.edu/~amaas/data/sentiment/

 

Sentiment Analysis

Publications Using the Dataset Andrew L. Maas, Raymond E. Daly, Peter T. Pham, Dan Huang, Andrew Y. Ng, and Christopher Potts. (2011). Learning Word Vectors for Sentiment Analysis. The 49th Annual Meeting of the Association for Computational Linguistics (A

ai.stanford.edu

 ์Šคํƒ ํผ๋“œ ๋Œ€ํ•™๊ต ์ธ๊ณต์ง€๋Šฅ ์—ฐ๊ตฌํŒ€์˜ ์›๋ณธ ์ž๋ฃŒ๋กœ classification ์ง„ํ–‰

import glob
import os
from random import shuffle

import numpy as np

Data load

def preprocess_data(filepath):
    """Load the IMDB review texts under `filepath` into a shuffled list.

    Expects the aclImdb layout: `filepath/pos/*.txt` holds positive reviews
    and `filepath/neg/*.txt` holds negative reviews, one review per file.

    Args:
        filepath: root directory containing the `pos` and `neg` subfolders.

    Returns:
        list of (label, review_text) tuples in random order;
        label is 1 for positive and 0 for negative.
    """
    positive_path = os.path.join(filepath, 'pos')  # typo fix: was `positivie_path`
    negative_path = os.path.join(filepath, 'neg')

    pos_label = 1
    neg_label = 0
    dataset = []

    # The dataset ships as UTF-8; pin the encoding so reads do not depend on
    # the platform default (e.g. cp949/cp1252 would fail on some reviews).
    for filename in glob.glob(os.path.join(positive_path, "*.txt")):
        with open(filename, 'r', encoding='utf-8') as f:
            dataset.append((pos_label, f.read()))

    for filename in glob.glob(os.path.join(negative_path, "*.txt")):
        with open(filename, 'r', encoding='utf-8') as f:
            dataset.append((neg_label, f.read()))

    shuffle(dataset)
    return dataset

dataset = preprocess_data('๊ฐœ์ธํ™˜๊ฒฝ/aclimdb/train')

wordvectorize

from nltk.tokenize import TreebankWordTokenizer
from gensim.models.keyedvectors import KeyedVectors
from nlpia.loaders import get_data

nltk์˜ TreebankwordTokenizer๋ฅผ ํ†ตํ•ด ํ† ํฐํ™” ์‚ฌ์šฉ dataset์˜ ๋ฌธ์žฅ์„ ํ† ํฐํ™” ์ง„ํ–‰

def tokenize_and_vectorize(dataset, vectors=None):
    """Convert each (label, text) sample into a list of word vectors.

    Tokenizes the review text with NLTK's TreebankWordTokenizer, then looks
    each token up in a word2vec-style keyed-vector mapping. Tokens missing
    from the vocabulary are silently skipped, so a sample's vector list can
    be shorter than its token count (or empty).

    Args:
        dataset: iterable of (label, text) pairs; only the text (index 1)
            is used here.
        vectors: optional token -> vector mapping. Defaults to the
            module-level `word_vectors` for backward compatibility.
            NOTE(review): `word_vectors` is never assigned in this file —
            it must be loaded (e.g. a gensim KeyedVectors model) before
            calling with the default.

    Returns:
        list with one entry per sample: that sample's list of token vectors.
    """
    if vectors is None:
        vectors = word_vectors  # fall back to the module-level mapping
    tokenizer = TreebankWordTokenizer()
    vectorized_data = []
    for sample in dataset:
        sample_vecs = []
        for token in tokenizer.tokenize(sample[1]):
            try:
                sample_vecs.append(vectors[token])
            except KeyError:
                # token not in the word-vector vocabulary — drop it
                pass
        vectorized_data.append(sample_vecs)

    return vectorized_data
    
    
# Turn every review into a sequence of word-embedding vectors.
# NOTE(review): relies on a module-level `word_vectors` mapping that is never
# assigned in this file — it must be loaded (e.g. via get_data) before this line.
vectorized_data = tokenize_and_vectorize(dataset)

collect_expected ์˜ˆ์ƒ sentiment  label ๊ฐ’ ์ถ”์ถœ 

def collect_expected(dataset):
    """Return the target label (element 0) of every sample, preserving order."""
    return [sample[0] for sample in dataset]

# Labels (1 = positive, 0 = negative) in the same shuffled order as the reviews.
expected = collect_expected(dataset)

train test ๋น„์œจ 8 : 2 

# 80/20 train/test split; the dataset was already shuffled in preprocess_data,
# so a positional split is acceptable here.
split_point = int(len(vectorized_data)*.8)
x_train = vectorized_data[:split_point]
y_train = expected[:split_point]
x_test = vectorized_data[split_point:]
# BUG FIX: y_test was sliced from vectorized_data (the features) instead of
# expected (the labels), which would make evaluation meaningless.
y_test = expected[split_point:]

maxlen ๋ฌธ์žฅ ๋‹น ๋‹จ์–ด์˜ ๊ธธ์ด๋ฅผ 400์œผ๋กœ ์ œํ•œ embedding_dims  ํ•ฉ์„ฑ๊ณฑ ์‹ ๊ฒฝ๋ง์— ์ž…๋ ฅํ•  ํ† ํฐ ๋ฒกํ„ฐ์˜ ๊ธธ์ด 
filters ํ›ˆ๋ จ์— ์‚ฌ์šฉํ•  ํ•„ํ„ฐ ๊ฐœ์ˆ˜ kernel_size ๊ฐ 1์ฐจ์› ํ•„ํ„ฐ์˜ ๋„ˆ๋น„ hidden_dims ์‹ ๊ฒฝ๋ง ๋์— ์žˆ๋Š” ์ˆœ๋ฐฉํ–ฅ ๋‰ด๋Ÿฐ ์ˆ˜ 

# CNN hyperparameters.
maxlen = 400          # max tokens kept per review; shorter reviews are zero-padded
batch_size = 32       # samples per gradient update
embedding_dims = 300  # length of each token vector fed into the conv net
filters = 250         # number of convolution filters to train
kernel_size = 3       # width (in tokens) of each 1-D filter
hidden_dims = 250     # neurons in the feed-forward layer at the end of the net
epochs = 2            # passes over the training data

pad_trunc maxlen(400)๋งŒํผ ๊ธธ์ด ์กฐ์ • keras์˜ pad_sequence ์™€ ๊ฐ™์€ ์—ญํ• ์„ ํ•œ๋‹ค.

def pad_trunc(data, maxlen):
    """Pad or truncate every token-vector sequence to exactly `maxlen` entries.

    Plays the same role as keras' pad_sequences, but for lists of dense
    vectors: sequences longer than `maxlen` are cut, shorter ones are
    extended with zero vectors.

    Args:
        data: list of samples, each a list of equal-length numeric vectors.
            The zero-vector length is taken from the first vector of the
            first sample (assumes at least one non-empty sample — same
            assumption as the original code).
        maxlen: target sequence length.

    Returns:
        A new list of new lists, each exactly `maxlen` vectors long.

    Fixes vs. the original:
        - no longer mutates the caller's samples in place (`temp = sample`
          followed by `temp.append(...)` appended padding into `data`);
        - each padding entry is a fresh list instead of one shared
          `zero_vector` object aliased across all samples;
        - empty `data` returns [] instead of raising IndexError.
    """
    if not data:
        return []

    vec_len = len(data[0][0])

    padded = []
    for sample in data:
        if len(sample) >= maxlen:
            padded.append(list(sample[:maxlen]))
        else:
            # copy the sample, then append a fresh zero vector per missing slot
            padding = [[0.0] * vec_len for _ in range(maxlen - len(sample))]
            padded.append(list(sample) + padding)
    return padded

# Pad/truncate every sample to maxlen, then pack into dense numpy tensors
# of shape (n_samples, maxlen, embedding_dims) for the Conv1D input.
x_train = pad_trunc(x_train,maxlen)
x_test = pad_trunc(x_test,maxlen)

# NOTE(review): requires `import numpy as np` at the top of the file.
x_train = np.reshape(x_train,(len(x_train),maxlen,embedding_dims))
y_train = np.array(y_train)
x_test = np.reshape(x_test,(len(x_test),maxlen,embedding_dims))
y_test = np.array(y_test)

 ๋ชจ๋ธ ๋นŒ๋“œ
- Sequential ์‹ ๊ฒฝ๋ง ๊ตฌ์ถ• ํ‹€ ์ƒ์„ฑ 
- Conv1D๋Š” 1์ฐจ์›์œผ๋กœ kernel_size * filter ๊ฐ„๊ฒฉ 1๋งŒํผ ์ง„ํ–‰ ํ›„ padding = 'valid' ์ž…๋ ฅ๊ณผ ๋™์ผ
- relu ์—ฐ์‚ฐ

# Build the 1-D convolutional sentiment classifier.
# NOTE(review): Sequential and Conv1D are never imported in this file —
# needs e.g. `from keras.models import Sequential` and
# `from keras.layers import Conv1D`.
model = Sequential()

# One Conv1D layer: `filters` feature detectors, each spanning `kernel_size`
# tokens, sliding one token at a time. padding='valid' adds no padding, so
# the output sequence is shorter than the input (maxlen - kernel_size + 1).
model.add(Conv1D(
          filters,
          kernel_size,
          padding='valid',
          activation='relu',
          strides=1,
          input_shape=(maxlen,embedding_dims)))

Max pool ์ตœ๋Œ“๊ฐ’์„ ๊ธฐ์ค€์œผ๋กœ ํŠน์ง• ์ถ”์ถœ 

# Global max pooling: keep only each filter's largest activation across the
# sequence, collapsing (steps, filters) to a single (filters,) feature vector.
# NOTE(review): GlobalMaxPool1D is also not imported in this file.
model.add(GlobalMaxPool1D())
 
๋ฐ˜์‘ํ˜•
๋‹คํ–ˆ๋‹ค