Attention
QKV Projections
\[\mathbf{Q} = \mathbf{X}\mathbf{W}_Q, \quad \mathbf{K} = \mathbf{X}\mathbf{W}_K, \quad \mathbf{V} = \mathbf{X}\mathbf{W}_V\]
Scaled Dot-Product Attention
\[\mathrm{Attn}(\mathbf{Q}, \mathbf{K}, \mathbf{V}) = \mathrm{softmax}\left(\frac{\mathbf{Q}\mathbf{K}^\top}{\sqrt{d_k}}\right)\mathbf{V}\]
\[\mathrm{CausalAttn}(\mathbf{Q}, \mathbf{K}, \mathbf{V}) = \mathrm{softmax}\left(\frac{\mathbf{Q}\mathbf{K}^\top}{\sqrt{d_k}} + \mathbf{M}\right)\mathbf{V}, \quad M_{ij} = \begin{cases} 0 & j \le i \\ -\infty & j > i \end{cases}\]
\[\mathrm{head}_i = \mathrm{Attn}(\mathbf{Q}_i, \mathbf{K}_i, \mathbf{V}_i)\]
\[\mathrm{MHA}(\mathbf{X}) = \mathrm{Concat}(\mathrm{head}_1, \dots, \mathrm{head}_h)\mathbf{W}_O\]
FlashAttention
\[\mathbf{S}_i = \frac{\mathbf{Q}\mathbf{K}_i^\top}{\sqrt{d_k}} \quad (\text{block } i \text{ of } \mathbf{K}, \mathbf{V})\]
\[m_i = \max(m_{i-1}, \max_j \mathbf{S}_{i,j})\]
\[\ell_i = e^{m_{i-1}-m_i}\ell_{i-1} + \sum_j e^{\mathbf{S}_{i,j}-m_i}\]
\[\mathbf{O}_i = e^{m_{i-1}-m_i}\mathbf{O}_{i-1} + e^{\mathbf{S}_i - m_i}\mathbf{V}_i\]
\[\mathbf{O} = \mathbf{O}_T / \ell_T \quad (\text{after the last block } T\text{, full } N\times N \text{ matrix never materialized})\]
QK Normalization
\[\hat{\mathbf{q}}_i = \frac{\mathbf{q}_i}{\lVert \mathbf{q}_i \rVert_2}, \quad \hat{\mathbf{k}}_j = \frac{\mathbf{k}_j}{\lVert \mathbf{k}_j \rVert_2}\]
\[\mathrm{Attn}(\hat{\mathbf{Q}}, \hat{\mathbf{K}}, \mathbf{V}) = \mathrm{softmax}\left(\frac{\hat{\mathbf{Q}}\hat{\mathbf{K}}^\top}{\sqrt{d_k}}\right)\mathbf{V}\]
Differential Attention
\[\mathrm{DiffAttn}(\mathbf{X}) = \left(\mathrm{softmax}\left(\frac{\mathbf{Q}_1\mathbf{K}_1^\top}{\sqrt{d_k}}\right) - \lambda \, \mathrm{softmax}\left(\frac{\mathbf{Q}_2\mathbf{K}_2^\top}{\sqrt{d_k}}\right)\right)\mathbf{V}\]
Positional Encoding
RoPE
\[\theta_i = 10000^{-2i/d}\]
\[\begin{aligned}
\mathrm{RoPE}(x_{2i}, x_{2i+1}, m) = \bigl(
&x_{2i}\cos(m\theta_i) - x_{2i+1}\sin(m\theta_i), \\
&x_{2i}\sin(m\theta_i) + x_{2i+1}\cos(m\theta_i)
\bigr)
\end{aligned}\]
YaRN
\[s = \frac{L'}{L}, \quad b' = b \cdot s^{\frac{d}{d-2}}\]
\[\theta_i' = (b')^{-2i/d}\]
ALiBi
\[\mathrm{ALiBi}(\mathbf{q}_i, \mathbf{k}_j) = \frac{\mathbf{q}_i \mathbf{k}_j^\top}{\sqrt{d_k}} - m \cdot (i - j)\]
NoPE
\[\mathbf{q}_i = \mathbf{x}_i \mathbf{W}_Q, \quad \mathbf{k}_j = \mathbf{x}_j \mathbf{W}_K \quad (\text{no positional term added})\]
KV Cache & Efficient Attention
KV Cache
\[\mathbf{K}_{1:t} = \mathrm{Concat}(\mathbf{K}_{1:t-1}, \mathbf{k}_t)\]
\[\mathbf{V}_{1:t} = \mathrm{Concat}(\mathbf{V}_{1:t-1}, \mathbf{v}_t)\]
\[\mathbf{o}_t = \mathrm{softmax}\left(\frac{\mathbf{q}_t \mathbf{K}_{1:t}^\top}{\sqrt{d_k}}\right)\mathbf{V}_{1:t}\]
MQA
\[\mathrm{head}_i = \mathrm{Attn}(\mathbf{Q}_i, \mathbf{K}, \mathbf{V}) \quad (\text{single } \mathbf{K}, \mathbf{V} \text{ shared across all heads})\]
GQA
\[\mathrm{head}_i = \mathrm{Attn}(\mathbf{Q}_i, \mathbf{K}_{\lceil i / g \rceil}, \mathbf{V}_{\lceil i / g \rceil})\]
MLA
\[\mathbf{c}_t = \mathbf{x}_t \mathbf{W}_{DKV} \in \mathbb{R}^{d_c}, \quad d_c \ll d\]
\[\mathbf{k}_t = \mathbf{c}_t \mathbf{W}_{UK}, \quad \mathbf{v}_t = \mathbf{c}_t \mathbf{W}_{UV} \quad (\text{cache only } \mathbf{c}_t)\]
YOCO
\[\mathbf{K}, \mathbf{V} = \mathrm{SelfDecoder}(\mathbf{X}) \quad (\text{computed once})\]
\[\mathbf{H}^{(l)} = \mathbf{H}^{(l-1)} + \mathrm{CrossAttn}(\mathbf{Q}^{(l)}, \mathbf{K}, \mathbf{V}) \quad (\text{reused by every cross-decoder layer})\]
SWA
\[\mathrm{SWA}(\mathbf{Q}, \mathbf{K}, \mathbf{V})_t = \mathrm{softmax}\left(\frac{\mathbf{q}_t \mathbf{K}_{t-w:t}^\top}{\sqrt{d_k}}\right)\mathbf{V}_{t-w:t}\]
NSA
\[\mathrm{NSA}(\mathbf{x}_t) = \sum_{c \,\in\, \{\mathrm{cmp},\, \mathrm{slc},\, \mathrm{win}\}} g_t^c \cdot \mathrm{Attn}(\mathbf{q}_t, \mathbf{K}_t^c, \mathbf{V}_t^c)\]
Infini-Attention
\[\mathbf{M}_s = \mathbf{M}_{s-1} + \phi(\mathbf{K}_s)^\top \mathbf{V}_s, \quad \mathbf{z}_s = \mathbf{z}_{s-1} + \phi(\mathbf{K}_s)^\top \mathbf{1} \quad (\text{compressive memory update})\]
\[\mathbf{o}_t = \alpha \cdot \mathrm{Attn}(\mathbf{q}_t, \mathbf{K}_s, \mathbf{V}_s) + (1-\alpha) \cdot \frac{\phi(\mathbf{q}_t)\mathbf{M}_{s-1}}{\phi(\mathbf{q}_t)\mathbf{z}_{s-1}}\]
Transformer Block (Pre-Norm)
\[\tilde{\mathbf{H}}^{(l)} = \mathbf{H}^{(l)} + \mathrm{MHA}(\mathrm{RMSNorm}(\mathbf{H}^{(l)}))\]
\[\mathbf{H}^{(l+1)} = \tilde{\mathbf{H}}^{(l)} + \mathrm{FFN}(\mathrm{RMSNorm}(\tilde{\mathbf{H}}^{(l)}))\]
MoE
\[g(\mathbf{x}) = \mathrm{TopK}(\mathrm{softmax}(\mathbf{x}\mathbf{W}_g))\]
\[\mathrm{MoE}(\mathbf{x}) = \sum_{i \in g(\mathbf{x})} g_i(\mathbf{x}) \, \mathrm{FFN}_i(\mathbf{x})\]
Load-Balancing Loss
\[\mathcal{L}_{\mathrm{aux}} = N \sum_{i=1}^{N} f_i \, P_i\]
Expert-Choice Routing
\[g(\mathbf{X}) = \mathrm{TopK}_{\text{tokens}}\!\left(\mathrm{softmax}(\mathbf{X}\mathbf{W}_g)^\top\right)\]
Shared Experts
\[\mathrm{MoE}(\mathbf{x}) = \sum_{j=1}^{N_s} \mathrm{FFN}_j^{\mathrm{shared}}(\mathbf{x}) + \sum_{i \,\in\, g(\mathbf{x})} g_i(\mathbf{x}) \, \mathrm{FFN}_i^{\mathrm{routed}}(\mathbf{x})\]
Mixture-of-Depths
\[r_t = \sigma(\mathbf{h}_t^{(l)} \mathbf{w}_r)\]
\[\mathbf{h}_t^{(l+1)} =
\begin{cases}
\mathbf{h}_t^{(l)} + r_t \cdot f(\mathbf{h}_t^{(l)}) & t \in \mathrm{TopK}(r) \\
\mathbf{h}_t^{(l)} & \text{otherwise}
\end{cases}\]
Decoding
Speculative Decoding
\[p_{\mathrm{accept}}(x) = \min\left(1, \frac{p_\theta(x)}{q_\phi(x)}\right)\]
\[p_{\mathrm{resample}}(x) \propto \max\bigl(0,\ p_\theta(x) - q_\phi(x)\bigr)\]
MTP
\[p(x_{t+1:t+k} \mid x_{\le t}) = \prod_{j=1}^{k} p_\theta(x_{t+j} \mid \mathbf{H}_t)\]
\[\mathcal{L}_{\mathrm{MTP}} = - \sum_{t} \sum_{j=1}^{k} \log p_\theta(x_{t+j} \mid \mathbf{H}_t)\]