# NSMC: Naver movie reviews with their star ratings converted to a
# binary positive/negative sentiment dataset.
# kaggle-nsmc
import os
import zipfile
def extractall(path, s_path, info=None, f_type=None):
    """Extract every ``*.zip`` archive found directly in *path* into *s_path*.

    Parameters
    ----------
    path : str
        Source directory. Must end with a path separator because file names
        are appended by plain string concatenation.
    s_path : str
        Destination directory for the extracted files (same trailing-separator
        requirement as *path*).
    info : bool, optional
        When truthy, print the name and size (MB) of the extracted files.
    f_type : str, optional
        Substring filter (e.g. ``'csv'``) applied to names in the size report.
        Required when *info* is truthy.
    """
    for file_name in os.listdir(path):
        # BUG FIX: the original used `file.split('.')[1] in "zip"`, a
        # substring test — 'a.z' matched (then crashed in ZipFile) and
        # 'a.zip.bak' matched wrongly. Test the real extension instead.
        if not file_name.lower().endswith('.zip'):
            continue
        # Context manager guarantees the archive handle is closed.
        with zipfile.ZipFile(path + file_name, 'r') as zip_ref:
            zip_ref.extractall(s_path)  # unzip into the data folder
    if info:
        # Report sizes of the extracted files matching f_type.
        for f in os.listdir(s_path):
            if f_type in f and 'zip' not in f:
                print(f.ljust(30) + str(round(os.path.getsize(s_path + f) / 1000000, 2)) + 'MB')
FILE_PATH = 'C:/Users/admin/Desktop/nsmc/'

# Create the data folder.
# BUG FIX: exist_ok=True so a re-run does not crash with FileExistsError.
DATA_FOLDER_NAME = 'data/'
os.makedirs(FILE_PATH + DATA_FOLDER_NAME, exist_ok=True)

# Unzip the downloaded archives into the data folder and report csv sizes.
extractall(path=FILE_PATH, s_path=FILE_PATH + DATA_FOLDER_NAME, info=True, f_type='csv')
# ko_data.csv 0.62MB
# List the extracted files with their sizes, for a quick sanity check.
data_dir = FILE_PATH + DATA_FOLDER_NAME
file = os.listdir(data_dir)
for name in file:
    size_mb = round(os.path.getsize(data_dir + name) / 1000000, 2)
    print(name.ljust(30) + str(size_mb) + 'MB')
# ko_data.csv 0.62MB
# ratings.txt 19.52MB
# ratings_test.txt 4.89MB
# ratings_train.txt 14.63MB
# Inspect train_data
# Load and inspect the NSMC training split (tab-separated: id, document, label).
import numpy as np
import pandas as pd
FILE = 'ratings_train.txt'
# quoting=3 is csv.QUOTE_NONE — review text may contain unbalanced quote
# characters, so quoting must be disabled to parse every row.
train_data = pd.read_csv(FILE_PATH+DATA_FOLDER_NAME+FILE,sep='\t',quoting=3)
train_data.columns # 'id', 'document', 'label'
train_data.head()
# sample rows (label 0 = negative, 1 = positive — TODO confirm mapping):
# id document label
# 0 9976970 ์ ๋๋น.. ์ง์ง ์ง์ฆ๋๋ค์ ๋ชฉ์๋ฆฌ 0
# 1 3819312 ํ ...ํฌ์คํฐ๋ณด๊ณ ์ด๋ฉ์ํ์ค....์ค๋ฒ์ฐ๊ธฐ์กฐ์ฐจ ๊ฐ๋ณ์ง ์๊ตฌ๋ 1
# 2 10265843 ๋๋ฌด์ฌ๋ฐ์๋ค๊ทธ๋์๋ณด๋๊ฒ์์ถ์ฒํ๋ค 0
# 3 9045019 ๊ต๋์ ์ด์ผ๊ธฐ๊ตฌ๋จผ ..์์งํ ์ฌ๋ฏธ๋ ์๋ค..ํ์ ์กฐ์ 0
# 4 6483659 ์ฌ์ด๋ชฌํ๊ทธ์ ์ต์ด์ค๋ฐ ์ฐ๊ธฐ๊ฐ ๋๋ณด์๋ ์ํ!์คํ์ด๋๋งจ์์ ๋์ด๋ณด์ด๊ธฐ๋ง ํ๋ ์ปค์คํด ... 1
train_data.info()
# Note: 'document' has 149995 non-null of 150000 rows — a handful of
# reviews are missing, which is why later steps use .astype(str).
# Data columns (total 3 columns):
# # Column Non-Null Count Dtype
# --- ------ -------------- -----
# 0 id 150000 non-null int64
# 1 document 149995 non-null object
# 2 label 150000 non-null int64
# dtypes: int64(2), object(1)
# memory usage: 3.4+ MB
# Check the label (0/1) ratio in train_data
import matplotlib.pyplot as plt

# Bar chart of the label frequencies to confirm the classes are balanced.
label_counts = train_data['label'].value_counts()
plt.figure(figsize=(10, 5))
plt.bar(label_counts.index, label_counts, color=['red', 'blue'])
plt.title("Value Count")
plt.xlabel("Count 0 1 ")
# Check the distribution of word counts per sentence
# Naive whitespace tokenisation; histogram (log scale) of words per review.
sentence_word = train_data['document'].astype(str).apply(str.split)
sentence_word_len = sentence_word.apply(len)
plt.figure(figsize=(10, 5))
plt.hist(sentence_word_len, bins=10, log=True)
plt.xlabel('word count num')
plt.ylabel('Sentence')
# Preprocessing
# Remove special characters and symbols
import re

# Strip common punctuation/bracket characters from every review.
rm = re.compile('[:;\'\"\[\]\(\)\.,@]')
rm_train = train_data['document'].astype(str).apply(lambda text: rm.sub('', text))
# Tokenise and remove stopwords
# Stopword list from https://bab2min.tistory.com/544
import numpy as np

# Load the 100-most-common Korean stopword list (tab-separated, no header).
# NOTE(review): the filename literal was mojibake in the scraped source;
# restored to the intended '한국어불용어100.txt' — confirm against the file
# actually on disk.
stopword_100 = pd.read_csv('한국어불용어100.txt', sep='\t', header=None)
stopword_list = stopword_100[0].to_numpy()

from konlpy.tag import Okt
okt = Okt()
# Morphological tokenisation with stemming.
# BUG FIX: the original wrapped okt.morphs(...) in str(), turning each
# token list into its repr string, so every later per-token loop iterated
# *characters*. Keep the token list as a real list.
okt_rm_train = rm_train.astype(str).apply(lambda x: okt.morphs(x, stem=True))
# 0 [์, ๋๋น, ์ง์ง, ์ง์ฆ๋๋ค, ๋ชฉ์๋ฆฌ]
# 1 [ํ , ํฌ์คํฐ, ๋ณด๊ณ , ์ด๋ฉ, ์ํ, ์ค, ์ค๋ฒ, ์ฐ๊ธฐ, ์กฐ์ฐจ, ๊ฐ๋ณ๋ค, ์๋ค]
# 2 [๋, ๋ฌด์ฌ, ๋ฐ์, ๋ค๊ทธ, ๋์, ๋ณด๋ค, ์ถ์ฒ, ํ, ๋ค]
# 3 [๊ต๋์, ์ด์ผ๊ธฐ, ๊ตฌ๋จผ, ์์งํ๋ค, ์ฌ๋ฏธ, ๋, ์๋ค, ํ์ , ์กฐ์ ]
# 4 [์ฌ์ด, ๋ชฌํ, ๊ทธ, ์, ์ต์ด์ค๋ฝ๋ค, ์ฐ๊ธฐ, ๊ฐ, ๋๋ณด์ด๋ค, ์ํ, !, ์คํ์ด๋...
# 5 [๋ง, ๊ฑธ์, ๋ง, ๋ผ๋ค, 3, ์ธ, ๋ถํฐ, ์ด๋ฑํ๊ต, 1, ํ๋, ์์ธ, 8, ์ด...
# 6 [์์, ์, ๊ธด์ฅ๊ฐ, ์, ์ ๋๋ก, ์ด๋ฆฌ๋ค, ํ๋ค]
# 7 [๋ณ, ๋ฐ๊ฐ, ๋, ์๊น๋ค, ์, ๋์ค๋ค, ์ด์๊ฒฝ, ๊ธธ์ฉ์ฐ, ์ฐ, ๊ธฐ, ์ํ, ์ด,...
# 8 [์ก์, ์ด, ์๋ค, ์ฌ๋ฏธ, ์๋ค, ๋ช, ์๋๋ค, ์ํ]
# 9 [์์ผ, ํ์ , ์ด, ๋ฎ๋ค, ?, ๊ฝค, ๋ณผ, ๋ง, ํ, ๋ฐ, ํ๋ฆฌ์ฐ๋, ์, ํ๋ คํ...
# Name: document, dtype: object
len(okt_rm_train)  # original inspection line: number of tokenised reviews

# Remove stopwords token by token, keeping one filtered list per review.
# BUG FIX: the original compared each whole review (a token list) against
# the stopword list, so no individual stopword was ever removed.
import ast

stopword_set = set(stopword_list)  # O(1) membership instead of O(n) array scan
rmstop_okt_rm_train = []
for row in okt_rm_train:
    # Tolerate rows stored as the repr string of a token list (an earlier
    # cell wrapped okt.morphs in str()); recover the real list if so.
    tokens = ast.literal_eval(row) if isinstance(row, str) else row
    rmstop_okt_rm_train.append([str(w) for w in tokens if str(w) not in stopword_set])
# --- blog footer (category navigation, kept for reference) ---
# Chatbot building (1) (2021.02.13)
# MaLSTM (2021.02.13)
# PCA, SVD latent semantic analysis (2021.02.11)
# CNN text similarity (feat. Quora pairs) (2021.02.10)
# KoNLPy tagger types (2021.02.06)