# Write the preprocessing function, then apply it
import re
import pandas as pd
from nltk.tokenize import word_tokenize

def preprocessing(data, stopword):
    # Strip punctuation/special characters, tokenize, then drop stopwords
    rm = re.compile(r'[:;\'\"\[\]\(\)\.,@]')
    rm_data = data.astype(str).apply(lambda x: re.sub(rm, '', x))
    word_token = [word_tokenize(x) for x in rm_data]
    remove_stopwords_tokens = []
    for sentence in word_token:
        temp = []
        for word in sentence:
            if word not in stopword:
                temp.append(word)
        remove_stopwords_tokens.append(temp)
    return remove_stopwords_tokens
test_data = train_data = pd.read_csv('data/ko_data.csv',encoding='cp949')
train_remove_stopword_tokens = preprocessing(train_data['document'],stopword_list)
test_remove_stopwords_tokens=preprocessing(test_data['Sentence'],stopword_list)
len(test_remove_stopwords_tokens)
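To see what the preprocessing does, it can help to print one review before and after cleaning; this check is not in the original post, and index 0 is just an arbitrary example:
# Illustrative check: one raw review vs. its cleaned, stopword-filtered tokens
print(train_data['document'][0])
print(train_remove_stopword_tokens[0])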
# Tokenize the words, then build the word vocabulary (word_vocab)
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
tokenizer = Tokenizer()
tokenizer.fit_on_texts(train_remove_stopword_tokens)
train_sequences = tokenizer.texts_to_sequences(train_remove_stopword_tokens)
test_sequences = tokenizer.texts_to_sequences(test_remove_stopwords_tokens)
word_vocab = tokenizer.word_index
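A small check (not part of the original code) shows what the tokenizer learned and how a review looks once mapped to word indices; the printed values will depend on your data:
# Illustrative check: vocabulary size and the first review as a sequence of word indices
print(len(word_vocab))
print(train_sequences[0])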
# Pad the train/test data: use the longest sentence as the reference length and fill the rest with zeros
# Save the data
# Note: len(max(train_sequences)) picks the lexicographically largest sequence,
# not the longest one, so compute the maximum length explicitly.
MAX_SEQUENCE_LENGTH = max(len(seq) for seq in train_sequences)
train_inputs = pad_sequences(train_sequences,maxlen=MAX_SEQUENCE_LENGTH,padding='post')
train_labels = np.array(train_data['label'])
test_inputs = pad_sequences(test_sequences,maxlen=MAX_SEQUENCE_LENGTH,padding='post')
test_labels = np.array(test_data['label'])
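At this point a quick shape check (purely illustrative, not in the original post) confirms that every review has been padded to the same length:
# Illustrative check: both matrices should be (num_samples, MAX_SEQUENCE_LENGTH)
print(train_inputs.shape, test_inputs.shape)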
np.save(open('nsmc_train_input.npy','wb'),train_inputs)
np.save(open('nsmc_test_input.npy','wb'),test_inputs)
np.save(open('nsmc_train_label.npy','wb'),train_labels)
np.save(open('nsmc_test_label.npy','wb'),test_labels)
data_configs = {}
data_configs['vocab_size'] = len(word_vocab)+1
data_configs['vocab'] = word_vocab
import json
json.dump(data_configs,open('data_configs.json','w'),ensure_ascii=False)
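The saved files can be loaded back in a later session roughly like this (a minimal sketch using the file names written above):
# Sketch: reload the preprocessed arrays and vocabulary in a later session
train_inputs = np.load(open('nsmc_train_input.npy', 'rb'))
train_labels = np.load(open('nsmc_train_label.npy', 'rb'))
test_inputs = np.load(open('nsmc_test_input.npy', 'rb'))
data_configs = json.load(open('data_configs.json', 'r'))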
# Model variables
batch_size: number of samples fed to the model at once
num_epochs: number of training epochs
valid_split: fraction of the training data held out for validation
max_len: sentence (sequence) length
import tensorflow as tf
from tensorflow.keras import layers
import numpy as np
import json
model_name = 'rnn_classifier_kr'
BATCH_SIZE = 128
NUM_EPOCHS = 8
VALID_SPLIT = 0.1
MAX_LEN = train_inputs.shape[1]
kargs = {
    'model_name': model_name,
    'vocab_size': data_configs['vocab_size'],
    'embedding_dimension': 100,
    'dropout_rate': 0.2,
    'lstm_dimension': 150,
    'dense_dimension': 150,
    'output_dimension': 1
}
# LSTM classification model
class LSTMclassifier(tf.keras.Model):
    def __init__(self, **kargs):
        super(LSTMclassifier, self).__init__(name=kargs['model_name'])
        self.embedding = layers.Embedding(input_dim=kargs['vocab_size'], output_dim=kargs['embedding_dimension'])
        self.lstm_1_layer = layers.LSTM(kargs['lstm_dimension'], return_sequences=True)
        self.lstm_2_layer = layers.LSTM(kargs['lstm_dimension'])
        self.dropout = layers.Dropout(kargs['dropout_rate'])
        self.fc1 = layers.Dense(units=kargs['dense_dimension'], activation=tf.keras.activations.tanh)
        # Output layer: ReLU is used here as an experiment in place of the usual sigmoid
        #self.fc2 = layers.Dense(units=kargs['output_dimension'], activation=tf.keras.activations.sigmoid)
        self.fc2 = layers.Dense(units=kargs['output_dimension'], activation=tf.keras.activations.relu)

    def call(self, x):
        # embedding -> dropout -> two stacked LSTMs -> dropout -> dense(tanh) -> dropout -> output
        x = self.embedding(x)
        x = self.dropout(x)
        x = self.lstm_1_layer(x)
        x = self.lstm_2_layer(x)
        x = self.dropout(x)
        x = self.fc1(x)
        x = self.dropout(x)
        x = self.fc2(x)
        return x
# Declare the model
# Monitor training progress with earlystop_callback and cp_callback
model = LSTMclassifier(**kargs)
model.compile(optimizer=tf.keras.optimizers.Adam(),
              loss=tf.keras.losses.BinaryCrossentropy(),
              metrics=[tf.keras.metrics.BinaryAccuracy(name='accuracy')])

from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
earlystop_callback = EarlyStopping(monitor='val_accuracy', min_delta=0.0001, patience=2)

checkpoint_path = 'weights.h5'
cp_callback = ModelCheckpoint(
    checkpoint_path, monitor='val_accuracy', verbose=1, save_best_only=True, save_weights_only=True
)

history = model.fit(train_inputs, train_labels, batch_size=BATCH_SIZE, epochs=NUM_EPOCHS,
                    validation_split=VALID_SPLIT, callbacks=[earlystop_callback, cp_callback])
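Because ModelCheckpoint stores only the best weights, it is reasonable (though not shown in the original post) to restore them before predicting:
# Optional: restore the best weights saved by cp_callback before running predictions
model.load_weights(checkpoint_path)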
# Submission
predictions = model.predict(test_inputs, batch_size=128)
predictions = predictions.squeeze(-1)
test_id = test_data['Id']
output = pd.DataFrame(data={"Id": list(test_id), 'Predicted': list(predictions)})
output.to_csv('lstm_predict_p_relu2.csv', index=False, quoting=3)
import matplotlib.pyplot as plt
plt.hist(predictions)  # distribution of the predicted scores
# kaggle competitions submit -c korean-sa-competition-bdc101 -f lstm_predict_relu.csv -m "lstm_relu"