Download the pretrained Google News word2vec model through nlpia. The vocabulary holds 3 million tokens.
from nlpia.loaders import get_data

wv = get_data('word2vec')  # downloads the pretrained GoogleNews-vectors-negative300 model on first use
len(wv.vocab)  # 3000000
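If nlpia is not available, the same vectors can be loaded directly with gensim (a minimal sketch; the path is an assumption pointing at a locally downloaded GoogleNews-vectors-negative300.bin.gz, and limit caps how many vectors are read to save RAM):

from gensim.models import KeyedVectors

wv = KeyedVectors.load_word2vec_format(
    'GoogleNews-vectors-negative300.bin.gz',  # hypothetical local path
    binary=True, limit=200000)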
We can confirm that n-gram terms are stored with their tokens joined by the '_' character.
import pandas as pd
vocab = pd.Series(wv.vocab)
vocab.iloc[1000000:1000006]
# Starwood_Hotels_HOT Vocab(count:2000000, index:1000000)
# Tammy_Kilborn Vocab(count:1999999, index:1000001)
# aortic_aneurism Vocab(count:1999998, index:1000002)
# Spragins_Hall Vocab(count:1999997, index:1000003)
# Ed_Iacobucci Vocab(count:1999996, index:1000004)
# Seilheimer Vocab(count:1999995, index:1000005)
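For example, a well-known bigram such as 'New_York' is stored as a single token in the Google News model and can be looked up directly:

'New_York' in wv.vocab  # True
wv['New_York'].shape    # (300,)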
The 'Seilheimer' word vector can be inspected as below; each word vector has 300 dimensions.
wv['Seilheimer']
# array([ 0.10107422, 0.08935547, -0.15429688, -0.04956055, -0.00436401,
# -0.06079102, -0.07421875, -0.06005859, -0.06079102, -0.02758789,
# 0.03015137, -0.05712891, -0.07617188, 0.05786133, -0.05249023,
# .....dtype=float32)
wv['Seilheimer'].shape # (300,)
Let's check the distance between 'Seilheimer' and 'Seil': the Euclidean distance, the cosine similarity, and the cosine distance.
import numpy as np
np.linalg.norm(wv['Seilheimer'] - wv['Seil'])  # Euclidean distance: 2.5733225
cos_similarity = np.dot(wv['Seilheimer'], wv['Seil']) / (
    np.linalg.norm(wv['Seilheimer']) * np.linalg.norm(wv['Seil']))
cos_similarity      # cosine similarity: 0.35953805
1 - cos_similarity  # cosine distance: 0.6404619514942169
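gensim's KeyedVectors exposes the same measures directly; these should agree with the manual computation above:

wv.similarity('Seilheimer', 'Seil')  # cosine similarity, ~0.3595
wv.distance('Seilheimer', 'Seil')    # cosine distance (1 - similarity), ~0.6405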
The distance between 'Seilheimer' and 'Seil' is moderate (cosine distance 0.64). Checking the distance between 'Seilheimer' and 'person' below gives a cosine distance of 0.96, i.e., almost no similarity, so by this measure the name is much closer to 'Seil' than to the generic word 'person'.
np.linalg.norm(wv['Seilheimer'] - wv['person'])  # Euclidean distance
cos_similarity = np.dot(wv['Seilheimer'], wv['person']) / (
    np.linalg.norm(wv['Seilheimer']) * np.linalg.norm(wv['person']))
cos_similarity      # cosine similarity: 0.039810065
1 - cos_similarity  # cosine distance: 0.96
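To get a feel for what kind of token 'Seilheimer' is, its nearest neighbours can also be listed with gensim's most_similar (output omitted here):

wv.most_similar('Seilheimer', topn=5)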
Next, using the GeoNames cities data, we compare each region's position on the map with the distances between the regions' word vectors.
cities = get_data('cities')
cities.head(1).T
# geonameid 3039154
# name El Tarter
# asciiname El Tarter
# alternatenames Ehl Tarter,Эл Тартер
# latitude 42.5795
# longitude 1.65362
# feature_class P
# feature_code PPL
# country_code AD
# cc2 NaN
# admin1_code 02
# admin2_code NaN
# admin3_code NaN
# admin4_code NaN
# population 1052
# elevation NaN
# dem 1721
# timezone Europe/Andorra
# modification_date 2012-11-03
First pull out only the South Korean entries.
Then map each row's admin1 code ('st') to the corresponding province name in the states data and merge them.
kr = cities[(cities.country_code == "KR") & (cities.admin1_code.notnull())].copy()
# name asciiname ... timezone modification_date
# geonameid ...
# 1832015 Heunghae Heunghae ... Asia/Seoul 2016-09-09
# 1832215 Yeonil Yeonil ... Asia/Seoul 2016-09-09
# 1832257 Neietsu Neietsu ... Asia/Seoul 2012-01-18
# 1832384 Eisen Eisen ... Asia/Seoul 2012-01-18
# 1832501 Reiko Reiko ... Asia/Seoul 2012-01-18
# ... ... ... ... ...
# 9887776 Dongmyeon Dongmyeon ... Asia/Seoul 2015-01-08
# 10913399 Yeonsan Yeonsan ... Asia/Seoul 2015-11-10
# 11124627 Oepo Oepo ... Asia/Seoul 2016-04-12
# 11523293 Sejong Sejong ... Asia/Seoul 2017-04-11
# 11549691 Bupyeong Bupyeong ... Asia/Seoul 2017-05-25
# [175 rows x 18 columns]
states = pd.read_csv('states.csv', header=0)
states
states.loc[0, 'Abbreviation'] = '01'  # use .loc to avoid chained-assignment pitfalls
states.loc[1, 'Abbreviation'] = '03'
states.loc[2, 'Abbreviation'] = '05'
states.loc[3, 'Abbreviation'] = '06'
states
states = dict(zip(states.Abbreviation, states.State))
kr['city']= kr.name.copy()
kr['st'] = kr.admin1_code.copy()
kr['state'] = kr.st.map(states)
kr[kr.columns[-3:]].head()
vocab = np.concatenate([kr.city, kr.st, kr.state])
vocab = np.array([word for word in vocab if word in wv.vocab])  # keep only tokens the model knows
vocab[:5] #array(['Eisen', 'Reiko', 'Eisen', 'Yeoncheon', 'Yeoju'], dtype='<U9')
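Since the loop below tests membership in vocab repeatedly, converting it to a set keeps those checks constant-time (an optional micro-optimization):

vocab = set(vocab)  # O(1) 'in' checks for the augmentation loop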
# Augmenting city word vectors with state word vectors
Korea has both Gwangju Metropolitan City and a separate Gwangju City in Gyeonggi-do. Tying a city to the state (province) it belongs to disambiguates such names; adding the state's vector to the city's vector is what 'augmenting' the city vector with the state information means.
city_plus_state = []
for c, state, st in zip(kr.city, kr.state, kr.st):
    if c not in vocab:
        continue
    row = []
    if state in vocab:
        try:
            row.extend(wv[c] + wv[state])  # city vector + full province-name vector
        except KeyError:
            continue
    else:
        try:
            row.extend(wv[c] + wv[st])  # fall back to the admin1 code
        except KeyError:
            continue
    city_plus_state.append(row)
kr_300D = pd.DataFrame(city_plus_state)
from sklearn.decomposition import PCA
pca = PCA(n_components=2)
kr_2D = pca.fit_transform(kr_300D.iloc[:,:300])
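To visualize the result, a quick scatter plot of the 2-D projection can be drawn with matplotlib (a minimal sketch; point labels are omitted to keep it readable):

import matplotlib.pyplot as plt

plt.scatter(kr_2D[:, 0], kr_2D[:, 1], s=10)
plt.xlabel('PC1')
plt.ylabel('PC2')
plt.title('Korean city + province vectors, PCA to 2-D')
plt.show()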