nlp.seas.harvard.edu/2018/04/01/attention.html#position-wise-feed-forward-networks
class MultiHeadedAttention(nn.Module):
    def __init__(self, h, d_model, dropout=0.1):
        "Take in model size and number of heads."
        super(MultiHeadedAttention, self).__init__()
        assert d_model % h == 0
        # We assume d_v always equals d_k
        self.d_k = d_model // h
        self.h = h
        self.linears = clones(nn.Linear(d_model, d_model), 4)
        self.attn = None
        # Dropout is a module applied to the attention weights inside attention()
        self.dropout = nn.Dropout(p=dropout)

    def forward(self, query, key, value, mask=None):
        "Implements Figure 2"
        if mask is not None:
            # Same mask applied to all h heads.
            mask = mask.unsqueeze(1)
        nbatches = query.size(0)

        # 1) Do all the linear projections in batch from d_model => h x d_k
        query, key, value = \
            [l(x).view(nbatches, -1, self.h, self.d_k).transpose(1, 2)
             for l, x in zip(self.linears, (query, key, value))]

        # 2) Apply attention on all the projected vectors in batch.
        x, self.attn = attention(query, key, value, mask=mask,
                                 dropout=self.dropout)

        # 3) "Concat" using a view and apply a final linear.
        x = x.transpose(1, 2).contiguous() \
             .view(nbatches, -1, self.h * self.d_k)
        return self.linears[-1](x)
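The class above assumes the clones and attention helpers defined earlier in the same article. A minimal sketch of them, with the imports they need, looks like this:

import copy
import math

import torch
import torch.nn as nn
import torch.nn.functional as F


def clones(module, N):
    "Produce N identical copies of a layer in a ModuleList."
    return nn.ModuleList([copy.deepcopy(module) for _ in range(N)])


def attention(query, key, value, mask=None, dropout=None):
    "Compute Scaled Dot-Product Attention."
    d_k = query.size(-1)
    scores = torch.matmul(query, key.transpose(-2, -1)) / math.sqrt(d_k)
    if mask is not None:
        # Masked positions get a very small number so softmax sends them to ~0
        scores = scores.masked_fill(mask == 0, -1e9)
    p_attn = F.softmax(scores, dim=-1)
    if dropout is not None:
        p_attn = dropout(p_attn)
    return torch.matmul(p_attn, value), p_attn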
The module passes the Query, Key, and Value information on which Scaled Dot-Product Attention will be performed, so that attention can work over the information of multiple sentences (a batch) rather than a single sentence. When the mask is passed along, only the lower triangular matrix carries valid information and the remaining upper triangular matrix is zeroed out. The unnecessary upper part is therefore filled with a very small number so that, when the Softmax function is applied afterwards, those positions become 0.
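As a quick illustration of that masking step, here is a small sketch (the subsequent_mask helper below is an assumed stand-in for the article's version):

import torch

def subsequent_mask(size):
    "1 on and below the diagonal (visible positions), 0 above (future positions)."
    return torch.tril(torch.ones(1, size, size, dtype=torch.uint8))

scores = torch.randn(1, 4, 4)                 # raw attention scores
mask = subsequent_mask(4)                     # lower triangular mask
masked = scores.masked_fill(mask == 0, -1e9)  # future positions get a very small number
weights = torch.softmax(masked, dim=-1)       # softmax turns those positions into ~0
print(weights)                                # upper-triangular entries are effectively 0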