
Download the pretrained Google News word2vec model through nlpia. The vocabulary contains 3 million tokens.

from nlpia.loaders import get_data

wv = get_data('word2vec')  # downloads the pretrained Google News vectors
len(wv.vocab)  # 3000000
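If nlpia is not available, the same vectors can be loaded directly with gensim; a sketch, assuming the GoogleNews-vectors-negative300.bin.gz file has already been downloaded. The limit argument keeps memory manageable by reading only the most frequent tokens, so len(wv.vocab) would then be 200000 rather than 3000000.

from gensim.models import KeyedVectors

wv = KeyedVectors.load_word2vec_format(
    'GoogleNews-vectors-negative300.bin.gz', binary=True, limit=200000)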
 

 

 n-gram ๋‹จ์–ด๋“ค์ด '_' ๋ฌธ์ž๋กœ ์—ฐ๊ฒฐ๋œ ๊ฒƒ์„ ํ™•์ธ ํ•  ์ˆ˜ ์žˆ๋‹ค.

import pandas as pd

vocab = pd.Series(wv.vocab)
vocab.iloc[1000000:1000006]
# Starwood_Hotels_HOT    Vocab(count:2000000, index:1000000)
# Tammy_Kilborn          Vocab(count:1999999, index:1000001)
# aortic_aneurism        Vocab(count:1999998, index:1000002)
# Spragins_Hall          Vocab(count:1999997, index:1000003)
# Ed_Iacobucci           Vocab(count:1999996, index:1000004)
# Seilheimer             Vocab(count:1999995, index:1000005)
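Such n-gram tokens can be queried like any other word; a small sketch, assuming the bigram 'New_York' survives in the pretrained vocabulary:

'New_York' in wv.vocab  # membership test for an n-gram token
wv.most_similar('New_York', topn=3)  # nearest neighbors of the bigram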

The word vector for 'Seilheimer' can be inspected as below; it is a 300-dimensional float32 vector.

wv['Seilheimer']
# array([ 0.10107422,  0.08935547, -0.15429688, -0.04956055, -0.00436401,
#        -0.06079102, -0.07421875, -0.06005859, -0.06079102, -0.02758789,
#         0.03015137, -0.05712891, -0.07617188,  0.05786133, -0.05249023,
#       .....dtype=float32)
wv['Seilheimer'].shape # (300,)

Let's check the distance between 'Seilheimer' and 'Seil'.

์œ ํด๋ฆฌ๋“œ ๊ฑฐ๋ฆฌ, ์ฝ”์‚ฌ์ธ ์œ ์‚ฌ๋„, ์ฝ”์‚ฌ์ธ ๊ฑฐ๋ฆฌ ํ™•์ธ

import numpy as np
np.linalg.norm(wv['Seilheimer'] - wv['Seil'])  # Euclidean distance: 2.5733225

cos_similarity = np.dot(wv['Seilheimer'], wv['Seil']) / (
    np.linalg.norm(wv['Seilheimer']) * np.linalg.norm(wv['Seil']))
cos_similarity  # cosine similarity: 0.35953805

1 - cos_similarity  # cosine distance: 0.6404619514942169

'Seilheimer' and 'Seil' turn out to be moderately close (cosine distance of about 0.64). Next, compare 'Seilheimer' with 'person': the cosine similarity comes out near zero (about 0.04), so the two vectors are almost orthogonal and the name 'Seilheimer' is barely associated with the generic word 'person' in this embedding.

 

np.linalg.norm(wv['Seilheimer'] - wv['person'])  # Euclidean distance

cos_similarity = np.dot(wv['Seilheimer'], wv['person']) / (
    np.linalg.norm(wv['Seilheimer']) * np.linalg.norm(wv['person']))
cos_similarity  # 0.039810065

1 - cos_similarity  # cosine distance: 0.96
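gensim also exposes these metrics directly on the KeyedVectors object: similarity() returns the cosine similarity and distance() returns one minus it, so the results should match the manual computations above.

wv.similarity('Seilheimer', 'Seil')   # cosine similarity, ~0.3595
wv.distance('Seilheimer', 'person')   # cosine distance, ~0.96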

# Compare locations on the map and the distances between regions using the GeoNames cities data

cities = get_data('cities')
cities.head(1).T
# geonameid                       3039154
# name                          El Tarter
# asciiname                     El Tarter
# alternatenames     Ehl Tarter,ะญะป ะขะฐั€ั‚ะตั€
# latitude                        42.5795
# longitude                       1.65362
# feature_class                         P
# feature_code                        PPL
# country_code                         AD
# cc2                                 NaN
# admin1_code                          02
# admin2_code                         NaN
# admin3_code                         NaN
# admin4_code                         NaN
# population                         1052
# elevation                           NaN
# dem                                1721
# timezone                 Europe/Andorra
# modification_date            2012-11-03
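As a sketch of the "distance on the map" part, the great-circle distance between two latitude/longitude points can be computed with the haversine formula (the helper name haversine_km is mine); here the first row, El Tarter, is compared against Seoul's coordinates:

import numpy as np

def haversine_km(lat1, lon1, lat2, lon2):
    """Great-circle distance in km between two (lat, lon) points."""
    lat1, lon1, lat2, lon2 = map(np.radians, [lat1, lon1, lat2, lon2])
    a = (np.sin((lat2 - lat1) / 2) ** 2
         + np.cos(lat1) * np.cos(lat2) * np.sin((lon2 - lon1) / 2) ** 2)
    return 2 * 6371 * np.arcsin(np.sqrt(a))  # Earth radius ~6371 km

row = cities.iloc[0]
haversine_km(row.latitude, row.longitude, 37.5665, 126.9780)  # vs. Seoul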

ํ•œ๊ตญ์˜ ์ง€๋ฐฉ ์ •๋ณด๋ฅผ ๋‹ค ๊ฐ€์ ธ์˜จ๋‹ค.

Then map each city's admin1 code (st) to the state data to merge the two datasets.

kr = cities[(cities.country_code == "KR") & (cities.admin1_code.notnull())].copy()
kr
#                 name  asciiname  ...    timezone  modification_date
# geonameid                        ...
# 1832015     Heunghae   Heunghae  ...  Asia/Seoul         2016-09-09
# 1832215       Yeonil     Yeonil  ...  Asia/Seoul         2016-09-09
# 1832257      Neietsu    Neietsu  ...  Asia/Seoul         2012-01-18
# 1832384        Eisen      Eisen  ...  Asia/Seoul         2012-01-18
# 1832501        Reiko      Reiko  ...  Asia/Seoul         2012-01-18
#               ...        ...  ...         ...                ...
# 9887776    Dongmyeon  Dongmyeon  ...  Asia/Seoul         2015-01-08
# 10913399     Yeonsan    Yeonsan  ...  Asia/Seoul         2015-11-10
# 11124627        Oepo       Oepo  ...  Asia/Seoul         2016-04-12
# 11523293      Sejong     Sejong  ...  Asia/Seoul         2017-04-11
# 11549691    Bupyeong   Bupyeong  ...  Asia/Seoul         2017-05-25
# [175 rows x 18 columns]
states = pd.read_csv('states.csv', header=0)
states

# replace the abbreviations with the GeoNames admin1 codes used in kr.admin1_code
states.loc[0, 'Abbreviation'] = '01'
states.loc[1, 'Abbreviation'] = '03'
states.loc[2, 'Abbreviation'] = '05'
states.loc[3, 'Abbreviation'] = '06'
states

states = dict(zip(states.Abbreviation, states.State))  # admin1 code -> state name

kr['city'] = kr.name.copy()
kr['st'] = kr.admin1_code.copy()
kr['state'] = kr.st.map(states)  # admin1 code -> full state name
kr[kr.columns[-3:]].head()
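As a quick sanity check (my addition), count how many cities failed to map to a state name; those rows fall back to the raw admin1 code in the augmentation loop below.

kr.state.isnull().sum()  # cities whose admin1 code was not found in states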

vocab = np.concatenate([kr.city, kr.st, kr.state])
vocab = np.array([word for word in vocab if word in wv.vocab])
vocab[:5] #array(['Eisen', 'Reiko', 'Eisen', 'Yeoncheon', 'Yeoju'], dtype='<U9')

 

# Augmenting the city word vectors with state word vectors

ํ•œ๊ตญ์—๋Š” ๊ด‘์ฃผ๊ด‘์—ญ์‹œ์™€ ๊ฒฝ๊ธฐ๋„ ๊ด‘์ฃผ์‹œ๊ฐ€ ์žˆ๋‹ค. ์ด์ฒ˜๋Ÿผ ๊ทธ ๋„์‹œ๊ฐ€ ์†ํ•œ ์ฃผ๋ฅผ ์—ฐ๊ฒฐํ•˜๋Š” ๊ฒƒ์ด ๋„์‹œ ๋ฒกํ„ฐ๋ฅผ ํ•ด๋‹น ์ฃผ ๋ฒกํ„ฐ ์ •๋ณด๋กœ 'augmentation' ํ•œ๋‹ค๊ณ  ํ•œ๋‹ค.

city_plus_state = []
for c, state, st in zip(kr.city, kr.state, kr.st):
    if c not in vocab:
        continue
    row = []
    if state in vocab:  # prefer the full state name when it has a vector
        try:
            row.extend(wv[c] + wv[state])
        except KeyError:
            continue
    else:  # fall back to the raw admin1 code
        try:
            row.extend(wv[c] + wv[st])
        except KeyError:
            continue
    city_plus_state.append(row)
kr_300D = pd.DataFrame(city_plus_state)

from sklearn.decomposition import PCA

pca = PCA(n_components=2)  # project the 300-D augmented vectors down to 2-D
kr_2D = pca.fit_transform(kr_300D.iloc[:, :300])
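The 2-D coordinates can then be scattered on a plane to see whether nearby cities cluster together; a minimal matplotlib sketch:

import matplotlib.pyplot as plt

plt.scatter(kr_2D[:, 0], kr_2D[:, 1], s=10)
plt.title('Korean cities: 300-D augmented vectors projected to 2-D by PCA')
plt.show()

To label each point with its city name, you would also need to collect the surviving city names inside the augmentation loop alongside city_plus_state.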

๋ฐ˜์‘ํ˜•
Done!