Build your own GPT
We are going to write a mini GPT from scratch in JavaScript. We’ll start with the smallest possible GPT-shaped text model, then keep replacing the toy pieces with the real ones. One file including training and inference, no dependencies, following the GPT-2 architecture, based on Andrej Karpathy's microgpt.
Coming soon:
- Backpropagation
- Softmax & Log loss
- Embeddings
- Hidden layers
- Attention
- and more
Tokens
We are building a text model. Text models see text as a sequence of tokens. Tokens are the atomic parts that the model reads and writes. They could be characters, words or something in between.
To keep it simple, for our model we’ll start with six tokens: ["a", "i", "l", "e", "n", "."].
A model like GPT-4 uses around 100k different tokens. You can try OpenAI’s tokenizer to better understand how each model tokenizes text.
// we are going to use 6 tokens:let vocab = ["a", "i", "l", "e", "n", "."]function generate() {let doc = ""// ...return doc}console.log(generate())
We are building a generative model. Given some tokens, the model predicts the next one, then repeats until it predicts the END OF SEQUENCE token. In our case, that’s ".".
A full run of that loop gives us one complete sequence of tokens. We call that a document. In the real world, documents can be things like web pages, PDFs, emails, or book chapters. Here, we start with tiny strings made from vocab.
For now, the model is just picking tokens at random. That makes it like rolling a die over and over, with each roll picking the next token:
The rest of this post is about making that next-token prediction less random.
// we are going to use 6 tokens:let vocab = ["a", "i", "l", "e", "n", "."]function generate() {let doc = ""// ...return doc}console.log(generate())
let vocab = ["a", "i", "l", "e", "n", "."]function generate() {let doc = ""while (true) {let tokenIndex = Math.floor(Math.random() * vocab.length,)let token = vocab[tokenIndex]if (token === ".") breakdoc += token}return doc}console.log(generate())
Logits
Instead of picking the next token directly, we'll get more control by first assigning a score to each token. We call these scores logits, and they let us compare all possible next tokens before choosing one.
Once we have the logits, we turn them into probabilities by dividing each one by the total sum.
This works for now because our logits are positive random scores. Later we’ll replace this shortcut with softmax, the standard way to turn real logits into probabilities.
let vocab = ["a", "i", "l", "e", "n", "."]function generate() {let doc = ""while (true) {let tokenIndex = Math.floor(Math.random() * vocab.length,)let token = vocab[tokenIndex]if (token === ".") breakdoc += token}return doc}console.log(generate())
let vocab = ["a", "i", "l", "e", "n", "."]function logitsToProbs(logits) {let sum = logits.reduce((s, logit) => s + logit, 0)return logits.map((logit) => logit / sum)}function generate() {let doc = ""while (true) {let logits = vocab.map(() => Math.random())let probs = logitsToProbs(logits)// tokenIndex = ?let token = vocab[tokenIndex]if (token === ".") breakdoc += token}return doc}console.log(generate())
Now we need to pick the next token from probs.
We do that in the pickIndex() function. It turns the probabilities into cumulative sums, picks a random number between 0 and 1, and finds where that number lands. For example:
probs = [0.3, 0.1, 0.1, 0.1, 0.2, 0.2]// we calculate the cumulative sums:sums = [0.3, 0.4, 0.5, 0.6, 0.8, 1]// we get a random target between 0 and 1:target = 0.53// index of first sum greater than target:tokenIndex = 3// use the token with that index:token = "e"
You can think of it like spinning a wheel of fortune: every token gets a slice, and bigger probabilities get bigger slices:
let vocab = ["a", "i", "l", "e", "n", "."]... }function logitsToProbs(logits) {function generate() {let doc = ""while (true) {let logits = vocab.map(() => Math.random())let probs = logitsToProbs(logits)// tokenIndex = ?let token = vocab[tokenIndex]if (token === ".") breakdoc += token}return doc}console.log(generate())
let vocab = ["a", "i", "l", "e", "n", "."]... }function logitsToProbs(logits) {function pickIndex(probs) {let total = 0let sums = probs.map((p) => (total += p))let target = Math.random()return sums.findIndex((s) => target < s)}function generate() {let doc = ""while (true) {let logits = vocab.map(() => Math.random())let probs = logitsToProbs(logits)let tokenIndex = pickIndex(probs)let token = vocab[tokenIndex]if (token === ".") breakdoc += token}return doc}console.log(generate())
Weights
So far, the logits were just random numbers created inside the loop. To make the model stable we move those numbers out of the loop and store them in an array called weights. Those weights are the values we will later adjust to make the model better.
A real GPT works the same way but at a larger scale. Instead of 6 weights, it has billions.
We also move that logic into a gpt(context) function. It ignores context for now, but it has the same shape as a real GPT: given a sequence of tokens, return logits for the next token.
let vocab = ["a", "i", "l", "e", "n", "."]... }function logitsToProbs(logits) {... }function pickIndex(probs) {function generate() {let doc = ""while (true) {let logits = vocab.map(() => Math.random())let probs = logitsToProbs(logits)let tokenIndex = pickIndex(probs)let token = vocab[tokenIndex]if (token === ".") breakdoc += token}return doc}console.log(generate())
let vocab = ["a", "i", "l", "e", "n", "."]... }function logitsToProbs(logits) {let weights = vocab.map(() => Math.random())function gpt(context) {return weights}... }function pickIndex(probs) {function generate() {let doc = ""while (true) {let logits = gpt(doc)let probs = logitsToProbs(logits)let tokenIndex = pickIndex(probs)let token = vocab[tokenIndex]if (token === ".") breakdoc += token}return doc}console.log(generate())
Before we can improve the model, we need a way to measure how well it is doing.
Let’s say we want a model that generates names using the five letters from vocab. To test how well it does, we take a real name, and check how much probability it gives to the correct next token at each position. That tells us how likely the model thinks the name is.
For example, here are the probabilities for the name "ann":
To make the numbers reproducible, we replaced the random initialization of weights with fixed values.
High probabilities mean the model expects that name, so it is doing well. Low probabilities mean the name feels unexpected, so the model is doing poorly.
let vocab = ["a", "i", "l", "e", "n", "."]... }function logitsToProbs(logits) {let weights = vocab.map(() => Math.random())function gpt(context) {return weights}... }function pickIndex(probs) {... }function generate() {console.log(generate())
let vocab = ["a", "i", "l", "e", "n", "."]... }function logitsToProbs(logits) {// let weights = vocab.map(() => Math.random())let weights = [0.62, 0.45, 0.85, 0.67, 0.17, 0.53]function gpt(context) {return weights}// "ann" probabilities:console.log(logitsToProbs(gpt("")))console.log(logitsToProbs(gpt("a")))console.log(logitsToProbs(gpt("an")))console.log(logitsToProbs(gpt("ann")))... }function pickIndex(probs) {... }function generate() {console.log(generate())
Loss
We want to capture that unexpectedness in a single value, so we can later work on minimizing it.
We call that value the loss: a single score that tells us how badly the model predicted the name.
For now, we use a very simple loss for each position: if the model gives the correct token probability p, the loss is 1 - p. Then we average those values across the whole name.
let vocab = ["a", "i", "l", "e", "n", "."]... }function logitsToProbs(logits) {// let weights = vocab.map(() => Math.random())let weights = [0.62, 0.45, 0.85, 0.67, 0.17, 0.53]function gpt(context) {return weights}// "ann" probabilities:console.log(logitsToProbs(gpt("")))console.log(logitsToProbs(gpt("a")))console.log(logitsToProbs(gpt("an")))console.log(logitsToProbs(gpt("ann")))... }function pickIndex(probs) {... }function generate() {console.log(generate())
let vocab = ["a", "i", "l", "e", "n", "."]... }function logitsToProbs(logits) {// let weights = vocab.map(() => Math.random())let weights = [0.62, 0.45, 0.85, 0.67, 0.17, 0.53]function gpt(context) {return weights}function calcLoss(doc) {let losses = []for (let pos = 0; pos <= doc.length; pos++) {let context = doc.slice(0, pos)let target = doc[pos] || "."let targetIndex = vocab.indexOf(target)let logits = gpt(context)let probs = logitsToProbs(logits)let targetProb = probs[targetIndex]let loss = 1 - targetProblosses.push(loss)}return (losses.reduce((sum, loss) => sum + loss, 0) /losses.length)}console.log(calcLoss("ann"))... }function pickIndex(probs) {... }function generate() {console.log(generate())
Gradients
To improve the model we need to minimize the loss.
One way to do that is to take one weight, add a very small value to it, and compute the loss again. If the loss goes down, that means increasing that weight helped. If the loss goes up, increasing that weight made things worse.
let weights = [0.62, 0.45, 0.85, 0.67, 0.17, 0.53]let oldLoss = calcLoss("ann") // 0.8868weights[0] += 0.01let newLoss = calcLoss("ann") // 0.8864let gradient = (newLoss - oldLoss) / 0.01 // -0.04
Measuring how the loss changes when we nudge one weight gives us a gradient.
In getGradients() we repeat the process and get the gradients for each weight.
let vocab = ["a", "i", "l", "e", "n", "."]... }function logitsToProbs(logits) {// let weights = vocab.map(() => Math.random())let weights = [0.62, 0.45, 0.85, 0.67, 0.17, 0.53]function gpt(context) {return weights}... }function calcLoss(doc) {console.log(calcLoss("ann"))... }function pickIndex(probs) {... }function generate() {console.log(generate())
let vocab = ["a", "i", "l", "e", "n", "."]... }function logitsToProbs(logits) {// let weights = vocab.map(() => Math.random())let weights = [0.62, 0.45, 0.85, 0.67, 0.17, 0.53]function gpt(context) {return weights}... }function calcLoss(doc) {function getGradients(doc) {const EPSILON = 0.01let baseWeights = [...weights]let oldLoss = calcLoss(doc)let gradients = []for (let i = 0; i < weights.length; i++) {weights = [...baseWeights]weights[i] += EPSILONlet newLoss = calcLoss(doc)gradients.push((newLoss - oldLoss) / EPSILON)}weights = baseWeightsreturn gradients}... }function pickIndex(probs) {... }function generate() {console.log(generate())
Training
Now that we have one gradient for each weight, we can use them to update the model.
for (let i = 0; i < weights.length; i++) {weights[i] -= LEARN_RATE * gradients[i]}
Positive gradients push weights down, negative gradients push weights up.
LEARN_RATE makes each update more conservative, so we don’t overshoot and make the loss worse.
let vocab = ["a", "i", "l", "e", "n", "."]... }function logitsToProbs(logits) {// let weights = vocab.map(() => Math.random())let weights = [0.62, 0.45, 0.85, 0.67, 0.17, 0.53]function gpt(context) {return weights}... }function calcLoss(doc) {function getGradients(doc) {const EPSILON = 0.01let baseWeights = [...weights]let oldLoss = calcLoss(doc)let gradients = []for (let i = 0; i < weights.length; i++) {weights = [...baseWeights]weights[i] += EPSILONlet newLoss = calcLoss(doc)gradients.push((newLoss - oldLoss) / EPSILON)}weights = baseWeightsreturn gradients}... }function pickIndex(probs) {... }function generate() {console.log(generate())
let vocab = ["a", "i", "l", "e", "n", "."]... }function logitsToProbs(logits) {// let weights = vocab.map(() => Math.random())let weights = [0.62, 0.45, 0.85, 0.67, 0.17, 0.53]function gpt(context) {return weights}... }function calcLoss(doc) {function getGradients(doc) {const EPSILON = 0.01let baseWeights = [...weights]let oldLoss = calcLoss(doc)let gradients = []for (let i = 0; i < weights.length; i++) {weights = [...baseWeights]weights[i] += EPSILONlet newLoss = calcLoss(doc)gradients.push((newLoss - oldLoss) / EPSILON)}weights = baseWeightsreturn gradients}function train(doc) {const STEPS = 75const LEARN_RATE = 0.1for (let step = 0; step < STEPS; step++) {let gradients = getGradients(doc)for (let i = 0; i < weights.length; i++) {weights[i] -= LEARN_RATE * gradients[i]}console.log(step, calcLoss(doc).toFixed(6))}}train("ann")... }function pickIndex(probs) {... }function generate() {console.log(generate())
So far, we only used one name to train the model. A real GPT trains on a huge number of documents. For our model, we’ll use a tiny dataset of names.
Now each training step picks one document from docs, calculates the gradients for that document, and updates the weights. Then the next step uses another document.
This does not make the model much smarter yet, but it makes the training loop more realistic: pick a document, measure loss, update weights, repeat.
let vocab = ["a", "i", "l", "e", "n", "."]... }function logitsToProbs(logits) {// let weights = vocab.map(() => Math.random())let weights = [0.62, 0.45, 0.85, 0.67, 0.17, 0.53]function gpt(context) {return weights}... }function calcLoss(doc) {... }function getGradients(doc) {function train(doc) {const STEPS = 75const LEARN_RATE = 0.1for (let step = 0; step < STEPS; step++) {let gradients = getGradients(doc)for (let i = 0; i < weights.length; i++) {weights[i] -= LEARN_RATE * gradients[i]}console.log(step, calcLoss(doc).toFixed(6))}}train("ann")... }function pickIndex(probs) {... }function generate() {console.log(generate())
let vocab = ["a", "i", "l", "e", "n", "."]... }function logitsToProbs(logits) {// let weights = vocab.map(() => Math.random())let weights = [0.62, 0.45, 0.85, 0.67, 0.17, 0.53]function gpt(context) {return weights}... }function calcLoss(doc) {... }function getGradients(doc) {let docs = ["ann","leia","elaine","ian","alan","laila","lee","nina",]function train() {const STEPS = 75const LEARN_RATE = 0.1for (let step = 0; step < STEPS; step++) {let doc = docs[step % docs.length]let gradients = getGradients(doc)for (let i = 0; i < weights.length; i++) {weights[i] -= LEARN_RATE * gradients[i]}console.log(step, calcLoss(doc).toFixed(4))}}train()... }function pickIndex(probs) {... }function generate() {console.log(generate())
I'm working on the rest of the post. Follow @pomber for updates.
Coming soon:
- Backpropagation
- Softmax & Log loss
- Embeddings
- Hidden layers
- Attention
- and more
let vocab = ["a", "i", "l", "e", "n", "."]... }function logitsToProbs(logits) {// let weights = vocab.map(() => Math.random())let weights = [0.62, 0.45, 0.85, 0.67, 0.17, 0.53]function gpt(context) {return weights}... }function calcLoss(doc) {... }function getGradients(doc) {"ann", "leia", ... ]let docs = [... }function train() {train()... }function pickIndex(probs) {... }function generate() {console.log(generate())
let vocab = ["a", "i", "l", "e", "n", "."]... }function logitsToProbs(logits) {// let weights = vocab.map(() => Math.random())let weights = [0.62, 0.45, 0.85, 0.67, 0.17, 0.53]function gpt(context) {return weights}... }function calcLoss(doc) {class Value {backward() {// to be continued...}}... }function getGradients(doc) {"ann", "leia", ... ]let docs = [... }function train() {train()... }function pickIndex(probs) {... }function generate() {console.log(generate())