Attention
$$\text{Attention}(Q, K, V) = \text{softmax}\!\left(\frac{QK^T}{\sqrt{d_k}}\right)V$$
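Here $Q$, $K$, and $V$ are linear projections of the token embeddings $X$, and $d_k$ is the dimension of the key vectors; dividing by $\sqrt{d_k}$ keeps the dot products from growing too large before the softmax:

$$Q = XW_Q, \qquad K = XW_K, \qquad V = XW_V, \qquad d_k = \text{dimension of each key vector}$$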
Implementing Attention in PyTorch (credit to StatQuest and DeepLearning.AI)
import torch
import torch.nn as nn
import torch.nn.functional as F

class Attention(nn.Module):
    def __init__(self, embedding_size=2, context_width=2):
        super().__init__()
        # Learned projection matrices for queries, keys, and values (no bias terms)
        self.W_q = nn.Linear(in_features=embedding_size, out_features=embedding_size, bias=False)
        self.W_k = nn.Linear(in_features=embedding_size, out_features=embedding_size, bias=False)
        self.W_v = nn.Linear(in_features=embedding_size, out_features=embedding_size, bias=False)
        self.embedding_size = embedding_size
        self.context_width = context_width

    def forward(self, token_embeddings):
        # Project the token embeddings into query, key, and value spaces
        q = self.W_q(token_embeddings)
        k = self.W_k(token_embeddings)
        v = self.W_v(token_embeddings)
        # Scaled dot-product attention: scale the similarity scores by sqrt(d_k)
        # *before* the softmax, then use the resulting weights to mix the values
        scores = torch.matmul(q, k.transpose(-2, -1)) / self.embedding_size ** 0.5
        attention = torch.matmul(F.softmax(scores, dim=-1), v)
        return attention
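A minimal sanity check, assuming a sequence of three 2-dimensional token embeddings (the tensor values below are arbitrary and only for illustration):

token_embeddings = torch.tensor([[1.16, 0.23],
                                 [0.57, 1.36],
                                 [4.41, -2.16]])
attention_layer = Attention(embedding_size=2, context_width=2)
output = attention_layer(token_embeddings)
print(output.shape)  # torch.Size([3, 2]): one attended vector per input token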