Build your own GPT

We are going to write a mini GPT from scratch in JavaScript. We’ll start with the smallest possible GPT-shaped text model, then keep replacing the toy pieces with the real ones. One file including training and inference, no dependencies, following the GPT-2 architecture, based on Andrej Karpathy's microgpt.

Coming soon:

  1. Backpropagation
  2. Softmax & Log loss
  3. Embeddings
  4. Hidden layers
  5. Attention
  6. and more

Tokens

We are building a text model. Text models see text as a sequence of tokens. Tokens are the atomic parts that the model reads and writes. They could be characters, words or something in between.

To keep it simple, for our model we’ll start with six tokens: ["a", "i", "l", "e", "n", "."].

A model like GPT-4 uses around 100k different tokens. You can try OpenAI’s tokenizer to better understand how each model tokenizes text.

// we are going to use 6 tokens:
let vocab = ["a", "i", "l", "e", "n", "."]
// Builds one document (a string made of tokens) and returns it.
// Stub for now: the generation loop is not written yet, so this returns "".
function generate() {
let doc = ""
// ...
return doc
}
console.log(generate())
Run

We are building a generative model. Given some tokens, the model predicts the next one, then repeats until it predicts the END OF SEQUENCE token. In our case, that’s the "." token.

A full run of that loop gives us one complete sequence of tokens. We call that a document. In the real world, documents can be things like web pages, PDFs, emails, or book chapters. Here, we start with tiny strings made from vocab.

For now, the model is just picking tokens at random. That makes it like rolling a die over and over, with each roll picking the next token:

a
i
l
e
n
.
Click to roll

The rest of this post is about making that next-token prediction less random.

// we are going to use 6 tokens:
let vocab = ["a", "i", "l", "e", "n", "."]
function generate() {
let doc = ""
// ...
return doc
}
console.log(generate())
let vocab = ["a", "i", "l", "e", "n", "."]
// Generates one document by drawing tokens uniformly at random from `vocab`.
// Drawing stops as soon as the "." token comes up; the "." itself is not
// included in the returned string.
function generate() {
  const picked = []
  for (;;) {
    // Math.random() is in [0, 1), so the floored value is a valid vocab index.
    const roll = Math.random() * vocab.length
    const next = vocab[Math.floor(roll)]
    if (next === ".") {
      return picked.join("")
    }
    picked.push(next)
  }
}
console.log(generate())
Run

Logits

Instead of picking the next token directly, we'll get more control by first assigning a score to each token. We call these scores logits, and they let us compare all possible next tokens before choosing one.

Once we have the logits, we turn them into probabilities by dividing each one by the total sum.

This works for now because our logits are positive random scores. Later we’ll replace this shortcut with softmax, the standard way to turn real logits into probabilities.

let vocab = ["a", "i", "l", "e", "n", "."]
function generate() {
let doc = ""
while (true) {
let tokenIndex = Math.floor(
Math.random() * vocab.length,
)
let token = vocab[tokenIndex]
if (token === ".") break
doc += token
}
return doc
}
console.log(generate())
let vocab = ["a", "i", "l", "e", "n", "."]
// Turns a list of scores into probabilities by dividing each score by the
// sum of all scores. The result has the same length and order as `logits`.
// This simple normalization assumes the scores are positive.
function logitsToProbs(logits) {
  let total = 0
  for (const score of logits) {
    total += score
  }
  const probs = []
  for (const score of logits) {
    probs.push(score / total)
  }
  return probs
}
// Generates one document token by token until the "." token is picked.
// Each iteration creates one random logit per vocab entry and converts the
// logits to probabilities with logitsToProbs().
// NOTE: this is an intermediate snapshot. `tokenIndex` is not declared in
// this function yet, so choosing an index from `probs` still has to be added
// before the function can run.
function generate() {
let doc = ""
while (true) {
let logits = vocab.map(() => Math.random())
let probs = logitsToProbs(logits)
// tokenIndex = ?  (placeholder: the index should be picked from `probs`)
let token = vocab[tokenIndex]
if (token === ".") break
doc += token
}
return doc
}
console.log(generate())
Run

Now we need to pick the next token from probs.

We do that in the pickIndex() function. It turns the probabilities into cumulative sums, picks a random number between 0 and 1, and finds where that number lands. For example:

probs = [0.3, 0.1, 0.1, 0.1, 0.2, 0.2]
// we calculate the cumulative sums:
sums = [0.3, 0.4, 0.5, 0.6, 0.8, 1]
// we get a random target between 0 and 1:
target = 0.53
// index of first sum greater than target:
tokenIndex = 3
// use the token with that index:
token = "e"

You can think of it like spinning a wheel of fortune: every token gets a slice, and bigger probabilities get bigger slices:

ailen.
Click to spin
let vocab = ["a", "i", "l", "e", "n", "."]
function logitsToProbs(logits) {
... }
function generate() {
let doc = ""
while (true) {
let logits = vocab.map(() => Math.random())
let probs = logitsToProbs(logits)
// tokenIndex = ?
let token = vocab[tokenIndex]
if (token === ".") break
doc += token
}
return doc
}
console.log(generate())
let vocab = ["a", "i", "l", "e", "n", "."]
function logitsToProbs(logits) {
... }
// Samples an index from `probs`, where index i is chosen with probability
// probs[i]. It draws a random target in [0, 1) and returns the first index
// whose running (cumulative) sum is greater than that target.
//
// Returns the chosen index, or -1 when `probs` is empty.
function pickIndex(probs) {
  const target = Math.random()
  let total = 0
  for (let i = 0; i < probs.length; i++) {
    total += probs[i]
    if (target < total) return i
  }
  // Floating-point rounding can leave the final cumulative sum slightly below
  // 1, so a target very close to 1 may not be smaller than any sum. Fall back
  // to the last index instead of returning -1, which callers would turn into
  // an `undefined` token.
  return probs.length - 1
}
// Generates one document. Every step creates one random logit per vocab
// entry, converts the logits to probabilities, and samples the next token
// with pickIndex(). Generation ends when the "." token is sampled; the "."
// is not part of the returned string.
function generate() {
  let text = ""
  for (;;) {
    const scores = vocab.map(() => Math.random())
    const chosen = pickIndex(logitsToProbs(scores))
    const next = vocab[chosen]
    if (next === ".") {
      return text
    }
    text += next
  }
}
console.log(generate())
Run

Weights

So far, the logits were just random numbers created inside the loop. To make the model stable we move those numbers out of the loop and store them in an array called weights. Those weights are the values we will later adjust to make the model better.

A real GPT works the same way but at a larger scale. Instead of 6 weights, it has billions.

We also move that logic into a gpt(context) function. It ignores context for now, but it has the same shape as a real GPT: given a sequence of tokens, return logits for the next token.

let vocab = ["a", "i", "l", "e", "n", "."]
function logitsToProbs(logits) {
... }
function pickIndex(probs) {
... }
function generate() {
let doc = ""
while (true) {
let logits = vocab.map(() => Math.random())
let probs = logitsToProbs(logits)
let tokenIndex = pickIndex(probs)
let token = vocab[tokenIndex]
if (token === ".") break
doc += token
}
return doc
}
console.log(generate())
let vocab = ["a", "i", "l", "e", "n", "."]
function logitsToProbs(logits) {
... }
let weights = vocab.map(() => Math.random())
// Returns the logits for the next token.
// `context` is accepted but not used yet: every call returns the same
// module-level `weights` array (the array itself, not a copy).
function gpt(context) {
return weights
}
function pickIndex(probs) {
... }
// Generates one document with the model: asks gpt() for the logits given the
// text produced so far, converts them to probabilities, and samples the next
// token. Stops when "." is sampled and returns the text without the ".".
function generate() {
  const pieces = []
  for (;;) {
    const soFar = pieces.join("")
    const probs = logitsToProbs(gpt(soFar))
    const next = vocab[pickIndex(probs)]
    if (next === ".") {
      return soFar
    }
    pieces.push(next)
  }
}
console.log(generate())
Run

Before we can improve the model, we need a way to measure how well it is doing.

Let’s say we want a model that generates names using the five letters from vocab. To test how well it does, we take a real name, and check how much probability it gives to the correct next token at each position. That tells us how likely the model thinks the name is.

For example, here are the probabilities for the name "ann":

To make the numbers reproducible, we replaced the random initialization of weights with fixed values.

a
i
l
e
n
.
gpt("")
a
0.19
0.14
0.26
0.20
0.05
0.16
gpt("a")
0.19
0.14
0.26
0.20
n
0.05
0.16
gpt("an")
0.19
0.14
0.26
0.20
n
0.05
0.16
gpt("ann")
0.19
0.14
0.26
0.20
0.05
.
0.16

High probabilities mean the model expects that name, so it is doing well. Low probabilities mean the name feels unexpected, so the model is doing poorly.

let vocab = ["a", "i", "l", "e", "n", "."]
function logitsToProbs(logits) {
... }
let weights = vocab.map(() => Math.random())
function gpt(context) {
return weights
}
function pickIndex(probs) {
... }
function generate() {
... }
console.log(generate())
let vocab = ["a", "i", "l", "e", "n", "."]
function logitsToProbs(logits) {
... }
// let weights = vocab.map(() => Math.random())
let weights = [0.62, 0.45, 0.85, 0.67, 0.17, 0.53]
function gpt(context) {
return weights
}
// "ann" probabilities:
console.log(logitsToProbs(gpt("")))
console.log(logitsToProbs(gpt("a")))
console.log(logitsToProbs(gpt("an")))
console.log(logitsToProbs(gpt("ann")))
function pickIndex(probs) {
... }
function generate() {
... }
console.log(generate())
Run

Loss

We want to capture that unexpectedness into one value, so we can later work on minimizing it.

We call that value the loss: a single score that tells us how badly the model predicted the name.

For now, we use a very simple loss for each position: if the model gives the correct token a probability p, the loss is 1 - p. Then we average those values across the whole name.

a
i
l
e
n
.
loss
gpt("")
a
0.19
0.14
0.26
0.20
0.05
0.16
1 - 0.19 = 0.81
gpt("a")
0.19
0.14
0.26
0.20
n
0.05
0.16
1 - 0.05 = 0.95
gpt("an")
0.19
0.14
0.26
0.20
n
0.05
0.16
1 - 0.05 = 0.95
gpt("ann")
0.19
0.14
0.26
0.20
0.05
.
0.16
1 - 0.16 = 0.84
"ann" loss =0.89
let vocab = ["a", "i", "l", "e", "n", "."]
function logitsToProbs(logits) {
... }
// let weights = vocab.map(() => Math.random())
let weights = [0.62, 0.45, 0.85, 0.67, 0.17, 0.53]
function gpt(context) {
return weights
}
// "ann" probabilities:
console.log(logitsToProbs(gpt("")))
console.log(logitsToProbs(gpt("a")))
console.log(logitsToProbs(gpt("an")))
console.log(logitsToProbs(gpt("ann")))
function pickIndex(probs) {
... }
function generate() {
... }
console.log(generate())
let vocab = ["a", "i", "l", "e", "n", "."]
function logitsToProbs(logits) {
... }
// let weights = vocab.map(() => Math.random())
let weights = [0.62, 0.45, 0.85, 0.67, 0.17, 0.53]
function gpt(context) {
return weights
}
// Measures how badly the model predicts `doc`.
//
// For every position (including the one right after the last character,
// where the expected token is the end marker "."), it asks gpt() for the
// logits given the preceding text, converts them to probabilities, and uses
// 1 - (probability of the correct token) as the loss for that position.
//
// Returns the average of those per-position losses.
// Throws an Error when `doc` contains a character that is not in `vocab`;
// otherwise the lookup would yield `undefined` and the loss would be NaN.
function calcLoss(doc) {
  let total = 0
  for (let pos = 0; pos <= doc.length; pos++) {
    const context = doc.slice(0, pos)
    // Past the end of the document the expected token is the end marker.
    const target = doc[pos] || "."
    const targetIndex = vocab.indexOf(target)
    if (targetIndex === -1) {
      throw new Error(
        `Unknown token "${target}" at position ${pos} of "${doc}"`,
      )
    }
    const probs = logitsToProbs(gpt(context))
    total += 1 - probs[targetIndex]
  }
  // There are doc.length + 1 positions: one per character plus the end marker.
  return total / (doc.length + 1)
}
console.log(calcLoss("ann"))
function pickIndex(probs) {
... }
function generate() {
... }
console.log(generate())
Run

Gradients

To improve the model we need to minimize the loss.

One way to do that is to take one weight, add a very small value to it, and compute the loss again. If the loss goes down, that means increasing that weight helped. If the loss goes up, increasing that weight made things worse.

let weights = [0.62, 0.45, 0.85, 0.67, 0.17, 0.53]
let oldLoss = calcLoss("ann") // 0.8868
weights[0] += 0.01
let newLoss = calcLoss("ann") // 0.8864
let gradient = (newLoss - oldLoss) / 0.01 // -0.04

Measuring how the loss changes when we nudge one weight gives us a gradient.

In getGradients() we repeat the process and get the gradients for each weight.

let vocab = ["a", "i", "l", "e", "n", "."]
function logitsToProbs(logits) {
... }
// let weights = vocab.map(() => Math.random())
let weights = [0.62, 0.45, 0.85, 0.67, 0.17, 0.53]
function gpt(context) {
return weights
}
function calcLoss(doc) {
... }
console.log(calcLoss("ann"))
function pickIndex(probs) {
... }
function generate() {
... }
console.log(generate())
let vocab = ["a", "i", "l", "e", "n", "."]
function logitsToProbs(logits) {
... }
// let weights = vocab.map(() => Math.random())
let weights = [0.62, 0.45, 0.85, 0.67, 0.17, 0.53]
function gpt(context) {
return weights
}
function calcLoss(doc) {
... }
// Estimates the gradient of the loss for `doc` with respect to each weight.
//
// For every weight it nudges that single weight up by EPSILON (all other
// weights stay at their starting values), recomputes the loss, and divides
// the change in loss by EPSILON.
//
// Returns one gradient per weight, in weight order. When it finishes, the
// module-level `weights` holds a copy of the values it started with.
function getGradients(doc) {
  const EPSILON = 0.01
  const snapshot = [...weights]
  const baseline = calcLoss(doc)
  const gradients = []
  for (const i of snapshot.keys()) {
    // Start from a fresh copy so earlier nudges do not accumulate.
    weights = [...snapshot]
    weights[i] += EPSILON
    const nudged = calcLoss(doc)
    gradients.push((nudged - baseline) / EPSILON)
  }
  weights = snapshot
  return gradients
}
function pickIndex(probs) {
... }
function generate() {
... }
console.log(generate())
Run

Training

Now that we have one gradient for each weight, we can use them to update the model.

for (let i = 0; i < weights.length; i++) {
weights[i] -= LEARN_RATE * gradients[i]
}

Positive gradients push weights down, negative gradients push weights up.

LEARN_RATE makes each update more conservative, so we don’t overshoot and make the loss worse.

a
i
l
e
n
.
loss
gradient
base
0.62
0.45
0.85
0.67
0.17
0.53
0.8868
a
0.63
0.45
0.85
0.67
0.17
0.53
0.8864
-0.04
i
0.62
0.46
0.85
0.67
0.17
0.53
0.8871
+0.03
l
0.62
0.45
0.86
0.67
0.17
0.53
0.8871
+0.03
e
0.62
0.45
0.85
0.68
0.17
0.53
0.8871
+0.03
n
0.62
0.45
0.85
0.67
0.18
0.53
0.8856
-0.12
.
0.62
0.45
0.85
0.67
0.17
0.54
0.8864
-0.04
new
0.624
0.447
0.847
0.667
0.182
0.534
0.8847
1/75
let vocab = ["a", "i", "l", "e", "n", "."]
function logitsToProbs(logits) {
... }
// let weights = vocab.map(() => Math.random())
let weights = [0.62, 0.45, 0.85, 0.67, 0.17, 0.53]
function gpt(context) {
return weights
}
function calcLoss(doc) {
... }
function getGradients(doc) {
const EPSILON = 0.01
let baseWeights = [...weights]
let oldLoss = calcLoss(doc)
let gradients = []
for (let i = 0; i < weights.length; i++) {
weights = [...baseWeights]
weights[i] += EPSILON
let newLoss = calcLoss(doc)
gradients.push((newLoss - oldLoss) / EPSILON)
}
weights = baseWeights
return gradients
}
function pickIndex(probs) {
... }
function generate() {
... }
console.log(generate())
let vocab = ["a", "i", "l", "e", "n", "."]
function logitsToProbs(logits) {
... }
// let weights = vocab.map(() => Math.random())
let weights = [0.62, 0.45, 0.85, 0.67, 0.17, 0.53]
function gpt(context) {
return weights
}
function calcLoss(doc) {
... }
function getGradients(doc) {
const EPSILON = 0.01
let baseWeights = [...weights]
let oldLoss = calcLoss(doc)
let gradients = []
for (let i = 0; i < weights.length; i++) {
weights = [...baseWeights]
weights[i] += EPSILON
let newLoss = calcLoss(doc)
gradients.push((newLoss - oldLoss) / EPSILON)
}
weights = baseWeights
return gradients
}
// Trains the model on a single document.
//
// Runs STEPS update rounds. Each round computes the gradients for `doc`,
// moves every weight against its gradient scaled by LEARN_RATE, and logs the
// step number together with the loss after the update (6 decimals).
function train(doc) {
  const STEPS = 75
  const LEARN_RATE = 0.1
  let step = 0
  while (step < STEPS) {
    const gradients = getGradients(doc)
    weights.forEach((_, i) => {
      weights[i] -= LEARN_RATE * gradients[i]
    })
    console.log(step, calcLoss(doc).toFixed(6))
    step += 1
  }
}
train("ann")
function pickIndex(probs) {
... }
function generate() {
... }
console.log(generate())
Run

So far, we only used one name to train the model. A real GPT trains on a huge number of documents. For our model, we’ll use a tiny dataset of names.

Now each training step picks one document from docs, calculates the gradients for that document, and updates the weights. Then the next step uses another document.

This does not make the model much smarter yet, but it makes the training loop more realistic: pick a document, measure loss, update weights, repeat.

let vocab = ["a", "i", "l", "e", "n", "."]
function logitsToProbs(logits) {
... }
// let weights = vocab.map(() => Math.random())
let weights = [0.62, 0.45, 0.85, 0.67, 0.17, 0.53]
function gpt(context) {
return weights
}
function calcLoss(doc) {
... }
function getGradients(doc) {
... }
function train(doc) {
const STEPS = 75
const LEARN_RATE = 0.1
for (let step = 0; step < STEPS; step++) {
let gradients = getGradients(doc)
for (let i = 0; i < weights.length; i++) {
weights[i] -= LEARN_RATE * gradients[i]
}
console.log(step, calcLoss(doc).toFixed(6))
}
}
train("ann")
function pickIndex(probs) {
... }
function generate() {
... }
console.log(generate())
let vocab = ["a", "i", "l", "e", "n", "."]
function logitsToProbs(logits) {
... }
// let weights = vocab.map(() => Math.random())
let weights = [0.62, 0.45, 0.85, 0.67, 0.17, 0.53]
function gpt(context) {
return weights
}
function calcLoss(doc) {
... }
function getGradients(doc) {
... }
let docs = [
"ann",
"leia",
"elaine",
"ian",
"alan",
"laila",
"lee",
"nina",
]
// Trains the model on the `docs` dataset.
//
// Runs STEPS update rounds, cycling through `docs` in order (one document
// per round). Each round computes the gradients for the current document,
// moves every weight against its gradient scaled by LEARN_RATE, and logs the
// step number with that document's loss after the update (4 decimals).
function train() {
  const STEPS = 75
  const LEARN_RATE = 0.1
  for (let step = 0; step < STEPS; step += 1) {
    const current = docs[step % docs.length]
    const gradients = getGradients(current)
    for (const i of weights.keys()) {
      weights[i] -= LEARN_RATE * gradients[i]
    }
    console.log(step, calcLoss(current).toFixed(4))
  }
}
train()
function pickIndex(probs) {
... }
function generate() {
... }
console.log(generate())
Run

I'm working on the rest of the post. Follow @pomber for updates.

Coming soon:

  1. Backpropagation
  2. Softmax & Log loss
  3. Embeddings
  4. Hidden layers
  5. Attention
  6. and more
let vocab = ["a", "i", "l", "e", "n", "."]
function logitsToProbs(logits) {
... }
// let weights = vocab.map(() => Math.random())
let weights = [0.62, 0.45, 0.85, 0.67, 0.17, 0.53]
function gpt(context) {
return weights
}
function calcLoss(doc) {
... }
function getGradients(doc) {
... }
let docs = [
"ann", "leia", ... ]
function train() {
... }
train()
function pickIndex(probs) {
... }
function generate() {
... }
console.log(generate())
let vocab = ["a", "i", "l", "e", "n", "."]
function logitsToProbs(logits) {
... }
// let weights = vocab.map(() => Math.random())
let weights = [0.62, 0.45, 0.85, 0.67, 0.17, 0.53]
function gpt(context) {
return weights
}
function calcLoss(doc) {
... }
// Placeholder class: backward() is declared but has no implementation yet.
class Value {
// Not implemented yet; calling it currently does nothing.
backward() {
// to be continued...
}
}
function getGradients(doc) {
... }
let docs = [
"ann", "leia", ... ]
function train() {
... }
train()
function pickIndex(probs) {
... }
function generate() {
... }
console.log(generate())
Run