# Define a preprocessing function and apply it

import re

import pandas as pd
from nltk.tokenize import word_tokenize

def preprocessing(data, stopword):
    # Strip basic punctuation, tokenize each sentence, then drop stopwords.
    rm = re.compile('[:;\'\"\\[\\]\\(\\)\\.,@]')
    rm_data = data.astype(str).apply(lambda x: re.sub(rm, '', x))

    word_token = [word_tokenize(x) for x in rm_data]
    remove_stopwords_tokens = []
    for sentence in word_token:
        temp = []
        for word in sentence:
            if word not in stopword:
                temp.append(word)
        remove_stopwords_tokens.append(temp)
    return remove_stopwords_tokens
# NOTE: train and test appear to share one file here; in practice the training set
# (with 'document'/'label' columns) and the test set ('Sentence'/'Id' columns) should
# each be loaded from their own CSV.
test_data = train_data = pd.read_csv('data/ko_data.csv', encoding='cp949')

train_remove_stopword_tokens = preprocessing(train_data['document'],stopword_list)
test_remove_stopwords_tokens=preprocessing(test_data['Sentence'],stopword_list)
len(test_remove_stopwords_tokens)
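
As a quick sanity check, the function can be run on a couple of made-up sentences with a tiny, hypothetical stopword list (this assumes NLTK's punkt tokenizer data has already been downloaded):

sample = pd.Series(['์ด ์˜ํ™” ์ •๋ง ์žฌ๋ฐŒ์–ด์š”', '๋ณ„๋กœ์˜€๋‹ค... ์‹œ๊ฐ„ ์•„๊น๋‹ค'])  # made-up sentences
sample_stopwords = ['์ด', '์ •๋ง']  # hypothetical stopword list
print(preprocessing(sample, sample_stopwords))
# -> one token list per sentence, with the listed punctuation removed and stopwords dropped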

# ๋‹จ์–ด ํ† ํฐํ™”  ํ›„ ๋‹จ์–ด ์‚ฌ์ „ ์ƒ์„ฑ(word_vocab)

import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
tokenizer = Tokenizer()
tokenizer.fit_on_texts(train_remove_stopword_tokens)
train_sequences = tokenizer.texts_to_sequences(train_remove_stopword_tokens)
test_sequences = tokenizer.texts_to_sequences(test_remove_stopwords_tokens)

word_vocab = tokenizer.word_index
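
It can help to peek at the fitted vocabulary and the resulting integer sequences; a small inspection sketch (output values depend on the actual data):

print(len(word_vocab))                  # number of distinct words (before +1 for padding)
print(list(word_vocab.items())[:5])     # a few (word, index) pairs, most frequent first
print(train_remove_stopword_tokens[0])  # first tokenized review
print(train_sequences[0])               # the same review as integer ids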

# Pad the train/test data: shorter sentences are filled with zeros up to the longest sentence
# Save the data

# Padding length is the longest sequence; max() must compare by length, not lexicographically
MAX_SEQUENCE_LENGTH = max(len(seq) for seq in train_sequences)
train_inputs = pad_sequences(train_sequences,maxlen=MAX_SEQUENCE_LENGTH,padding='post')
train_labels = np.array(train_data['label'])
test_inputs = pad_sequences(test_sequences,maxlen=MAX_SEQUENCE_LENGTH,padding='post')
test_labels = np.array(test_data['label'])
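
A minimal illustration of what pad_sequences with padding='post' does, using toy sequences rather than the real data:

demo = pad_sequences([[5, 2], [7, 1, 3, 9]], maxlen=4, padding='post')
print(demo)
# [[5 2 0 0]
#  [7 1 3 9]]
print(train_inputs.shape, test_inputs.shape)  # (num_samples, MAX_SEQUENCE_LENGTH)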


np.save(open('nsmc_train_input.npy','wb'),train_inputs)
np.save(open('nsmc_test_input.npy','wb'),test_inputs)

np.save(open('nsmc_train_label.npy','wb'),train_labels)
np.save(open('nsmc_test_label.npy','wb'),test_labels)

data_configs = {}
data_configs['vocab_size'] = len(word_vocab)+1
data_configs['vocab'] = word_vocab

import json
json.dump(data_configs, open('data_configs.json', 'w', encoding='utf-8'), ensure_ascii=False)
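
A minimal sketch of reloading the saved artifacts later, e.g. in a separate training script; the variable names here are illustrative only:

loaded_inputs = np.load('nsmc_train_input.npy')
loaded_labels = np.load('nsmc_train_label.npy')
loaded_configs = json.load(open('data_configs.json', 'r', encoding='utf-8'))
print(loaded_inputs.shape, loaded_labels.shape, loaded_configs['vocab_size'])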

# ๋ชจ๋ธ ๋ณ€์ˆ˜ 
 batch_size: ํ•œ๋ฒˆ์— ๋„ฃ์„ ๋ฐ์ดํ„ฐ ์–‘
 num_epochs: ์ˆœํ™˜ ์ˆ˜
 valid_split: ๊ฒ€์ฆ ๋ฐ์ดํ„ฐ ๋น„์œจ
 max_len: ๋ฌธ์žฅ ๊ธธ์ด

import tensorflow as tf
from tensorflow.keras import layers
import numpy as np
import json

model_name = 'rnn_classifier_kr'
BATCH_SIZE = 128
NUM_EPOCHS = 8
VALID_SPLIT = 0.1
MAX_LEN = train_inputs.shape[1]

kargs = {
    'model_name':model_name,
    'vocab_size':data_configs['vocab_size'],
    'embedding_dimension':100,
    'dropout_rate':0.2,
    'lstm_dimension':150,
    'dense_dimension':150,
    'output_dimension':1
}

# LSTM ๋ถ„๋ฅ˜ ๋ชจ๋ธ

class LSTMclassifier(tf.keras.Model):
    def __init__(self, **kargs):
        super(LSTMclassifier, self).__init__(name=kargs['model_name'])
        self.embedding = layers.Embedding(input_dim=kargs['vocab_size'], output_dim=kargs['embedding_dimension'])
        self.lstm_1_layer = layers.LSTM(kargs['lstm_dimension'], return_sequences=True)
        self.lstm_2_layer = layers.LSTM(kargs['lstm_dimension'])
        self.dropout = layers.Dropout(kargs['dropout_rate'])
        self.fc1 = layers.Dense(units=kargs['dense_dimension'], activation=tf.keras.activations.tanh)
        # sigmoid is the standard pairing with BinaryCrossentropy; relu is used here instead
        #self.fc2 = layers.Dense(units=kargs['output_dimension'], activation=tf.keras.activations.sigmoid)
        self.fc2 = layers.Dense(units=kargs['output_dimension'], activation=tf.keras.activations.relu)

    def call(self, x):
        x = self.embedding(x)     # (batch, seq_len) -> (batch, seq_len, embedding_dim)
        x = self.dropout(x)
        x = self.lstm_1_layer(x)  # keeps the full sequence for the next LSTM
        x = self.lstm_2_layer(x)  # returns only the final hidden state
        x = self.dropout(x)
        x = self.fc1(x)
        x = self.dropout(x)
        x = self.fc2(x)           # one score per sample
        return x

# Declare the model
# Training progress is tracked with earlystop_callback and cp_callback

model = LSTMclassifier(**kargs)
model.compile(optimizer=tf.keras.optimizers.Adam(),
              loss=tf.keras.losses.BinaryCrossentropy(),
              metrics=[tf.keras.metrics.BinaryAccuracy(name='accuracy')])
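
Before training, it can be useful to trace how tensor shapes flow through call(); the sketch below runs a tiny fake batch through the model purely as a sanity check.

# Shape flow (assuming inputs padded to MAX_LEN):
#   input -> (batch, MAX_LEN)
#   embedding -> (batch, MAX_LEN, 100)
#   lstm_1 (return_sequences=True) -> (batch, MAX_LEN, 150)
#   lstm_2 -> (batch, 150)
#   fc1 -> (batch, 150), fc2 -> (batch, 1)
dummy_batch = tf.zeros((2, MAX_LEN), dtype=tf.int32)  # fake batch of two all-padding sequences
print(model(dummy_batch).shape)                       # expected: (2, 1)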

from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
earlystop_callback = EarlyStopping(monitor='val_accuracy',min_delta=0.0001,patience=2)

checkpoint_path = 'weights.h5'

cp_callback = ModelCheckpoint(
    checkpoint_path, monitor='val_accuracy', verbose=1, save_best_only=True, save_weights_only=True
)

history = model.fit(train_inputs,train_labels,batch_size=BATCH_SIZE,epochs=NUM_EPOCHS,
                    validation_split=VALID_SPLIT,callbacks=[earlystop_callback,cp_callback])
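
Because ModelCheckpoint was configured with save_best_only=True, the best validation-accuracy weights end up in weights.h5; a short sketch of restoring them before prediction so the last-epoch weights are not used:

model.load_weights(checkpoint_path)  # restore the weights saved by cp_callback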

# Submission

predictions = model.predict(test_inputs, batch_size=128)
predictions = predictions.squeeze(-1)
test_id = test_data['Id']

output = pd.DataFrame(data={"Id": list(test_id), 'Predicted': list(predictions)})

output.to_csv('lstm_predict_p_relu2.csv',index=False,quoting=3)


import matplotlib.pyplot as plt

plt.hist(predictions)
# kaggle competitions submit -c korean-sa-competition-bdc101 -f lstm_predict_p_relu2.csv -m "lstm_relu"
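
The raw relu outputs are unbounded scores rather than probabilities, which is what the histogram above helps to inspect. If the competition expects hard 0/1 labels (an assumption here), a simple threshold could be applied before writing the submission file; the cut-off and file name below are purely illustrative.

# Hypothetical post-processing: turn raw scores into 0/1 labels (0.5 is a placeholder cut-off)
hard_labels = (predictions > 0.5).astype(int)
binary_output = pd.DataFrame({'Id': list(test_id), 'Predicted': list(hard_labels)})
binary_output.to_csv('lstm_predict_relu_binary.csv', index=False, quoting=3)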

๋ฐ˜์‘ํ˜•
Done.