Andrej Karpathy's microgpt.py translated into the D programming language
/*
The most atomic way to train and run inference for a GPT in pure, dependency-free D.
This file is the complete algorithm. Everything else is just efficiency.
@karpathy
Translation by @DannyArends
*/
import std.algorithm : countUntil, fold, joiner, map, maxElement, min, sort, sum, uniq;
import std.array : array;
import std.file : exists, readText, write;
import std.format : format;
import std.math : log, sqrt, cos, PI;
import std.net.curl : get;
import std.random : Random, uniform, randomShuffle, unpredictableSeed;
import std.range : zip;
import std.stdio : writef, writefln, writeln;
import std.string : split;
import std.utf : byDchar;
class Value {
  double data;
  double grad = 0;
  Value[] children;
  double[] localGrads;
  // Init
  this(double data, Value[] children = [], double[] localGrads = []) {
    this.data = data;
    this.children = children;
    this.localGrads = localGrads;
  }
  // Unary minus
  Value opUnary(string op)() if (op == "-") { return this * -1.0; }
  // Binary ops with a double
  Value opBinary(string op)(double rhs) {
    static if (op == "^^")
      return new Value(data ^^ rhs, [this], [rhs * data ^^ (rhs - 1)]);
    else
      return opBinary!op(new Value(rhs));
  }
  // Binary ops with another Value
  Value opBinary(string op)(Value rhs) {
    static if (op == "+") return new Value(data + rhs.data, [this, rhs], [1.0, 1.0]);
    else static if (op == "*") return new Value(data * rhs.data, [this, rhs], [rhs.data, data]);
    else static if (op == "-") return this + (-rhs);
    else static if (op == "/") return this * rhs ^^ -1.0;
    else static assert(0, "Unsupported op: " ~ op);
  }
  // opBinaryRight ops with a double
  Value opBinaryRight(string op)(double lhs) {
    static if (op == "+") return this + lhs;
    else static if (op == "*") return this * lhs;
    else static if (op == "-") return new Value(lhs) - this;
    else static if (op == "/") return new Value(lhs) / this;
    else static assert(0, "Unsupported op: " ~ op);
  }
  // Non-operator methods
  Value log() { import std.math : log; return new Value(log(data), [this], [1.0 / data]); }
  Value exp() { import std.math : exp; return new Value(exp(data), [this], [exp(data)]); }
  Value relu() { return new Value(data > 0 ? data : 0, [this], [data > 0 ? 1.0 : 0.0]); }
  // Backpropagation
  void backward() {
    Value[] topo;
    bool[Value] visited;
    void buildTopo(Value v) {
      if (v in visited) return;
      visited[v] = true;
      foreach (child; v.children) { buildTopo(child); }
      topo ~= v;
    }
    buildTopo(this);
    this.grad = 1.0;
    foreach_reverse (v; topo) {
      foreach (i, child; v.children) { child.grad += v.localGrads[i] * v.grad; }
    }
  }
}
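// Illustrative sanity check (not part of the original gist): verifies the
// reverse-mode chain rule on d = a*b + c, for which dd/da = b, dd/db = a,
// and dd/dc = 1.
unittest {
  import std.math : isClose;
  auto a = new Value(2.0);
  auto b = new Value(3.0);
  auto c = new Value(4.0);
  auto d = a * b + c; // d = 2*3 + 4 = 10
  d.backward();
  assert(isClose(d.data, 10.0));
  assert(isClose(a.grad, 3.0)); // = b.data
  assert(isClose(b.grad, 2.0)); // = a.data
  assert(isClose(c.grad, 1.0));
}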
double gaussRandom(ref Random rng, double mu, double std) {
  double u1 = uniform!("(]")(0.0, 1.0, rng);
  double u2 = uniform!("[]")(0.0, 1.0, rng);
  double z = sqrt(-2.0 * log(u1)) * cos(2.0 * PI * u2);
  return mu + std * z;
}
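// Rough statistical check of the Box-Muller transform above (illustrative,
// fixed seed for determinism): the sample mean should land near mu and the
// second moment near std^2.
unittest {
  import std.math : abs;
  auto rng = Random(42);
  double s = 0.0, s2 = 0.0;
  enum N = 10_000;
  foreach (i; 0..N) { double z = gaussRandom(rng, 0.0, 1.0); s += z; s2 += z * z; }
  assert(abs(s / N) < 0.05);        // mean near 0
  assert(abs(s2 / N - 1.0) < 0.05); // second moment near 1
}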
Value[][] matrix(size_t nout, size_t nin, double std = 0.08) {
  auto rng = Random(unpredictableSeed);
  Value[][] result = new Value[][](nout);
  foreach (i; 0..nout) {
    result[i] = new Value[](nin);
    foreach (j; 0..nin) { result[i][j] = new Value(gaussRandom(rng, 0.0, std)); }
  }
  return result;
}
Value[] linear(Value[] x, Value[][] w) {
  return w.map!(wo => zip(wo, x).map!(t => t[0] * t[1]).fold!((a, b) => a + b)(new Value(0.0))).array;
}
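// Illustrative check (not part of the original gist): linear() is a plain
// matrix-vector product, each output being the dot product of one weight row
// with x. The hand-picked values below are easy to verify on paper.
unittest {
  import std.math : isClose;
  Value[] x = [new Value(1.0), new Value(2.0)];
  Value[][] w = [[new Value(1.0), new Value(0.0)],
                 [new Value(3.0), new Value(4.0)]];
  Value[] y = linear(x, w);
  assert(isClose(y[0].data, 1.0));  // 1*1 + 0*2
  assert(isClose(y[1].data, 11.0)); // 3*1 + 4*2
}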
Value[] softmax(Value[] logits) {
  double maxVal = logits.map!(v => v.data).maxElement;
  Value[] exps = logits.map!(v => (v - maxVal).exp()).array;
  Value total = exps.fold!((a, b) => a + b)(new Value(0.0));
  Value[] result = exps.map!(e => e / total).array;
  return result;
}
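// Illustrative check: softmax returns a probability distribution (sums to 1)
// that preserves the ordering of the logits; the max-subtraction above is
// purely for numerical stability and does not change the result.
unittest {
  import std.math : isClose;
  Value[] p = softmax([new Value(1.0), new Value(2.0), new Value(3.0)]);
  assert(isClose(p.map!(v => v.data).sum, 1.0));
  assert(p[2].data > p[1].data && p[1].data > p[0].data);
}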
Value[] rmsnorm(Value[] x) {
  Value ms = x.map!(xi => xi * xi).fold!((a, b) => a + b)(new Value(0.0)) / cast(double)x.length;
  Value scale = (ms + 1e-5) ^^ -0.5;
  return x.map!(xi => xi * scale).array;
}
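// Illustrative check: after rmsnorm the mean square of the activations is ~1
// (up to the 1e-5 stabilizer above).
unittest {
  import std.math : isClose;
  Value[] y = rmsnorm([new Value(3.0), new Value(4.0)]);
  double ms = y.map!(v => v.data * v.data).sum / y.length;
  assert(isClose(ms, 1.0, 1e-3));
}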
Value[] gpt(Value[][][string] stateDict, size_t nLayer, size_t nHead, size_t headDim, size_t tokenId, size_t posId, Value[][][] keys, Value[][][] values) {
  Value[] tokEmb = stateDict["wte"][tokenId];
  Value[] posEmb = stateDict["wpe"][posId];
  Value[] x = rmsnorm(zip(tokEmb, posEmb).map!(t => t[0] + t[1]).array);
  foreach (li; 0..nLayer) {
    // 1) Multi-head Attention block
    Value[] xResidual = x;
    x = rmsnorm(x);
    // Project input into query, key, value spaces
    Value[] q = linear(x, stateDict[format("layer%d.attn_wq", li)]);
    Value[] k = linear(x, stateDict[format("layer%d.attn_wk", li)]);
    Value[] v = linear(x, stateDict[format("layer%d.attn_wv", li)]);
    // Append k,v to cache (enables attending to all previous tokens)
    keys[li] ~= k;
    values[li] ~= v;
    Value[] xAttn;
    foreach (h; 0..nHead) {
      size_t hs = h * headDim;
      // Slice out this head's portion of q, k, v
      Value[] qH = q[hs..hs+headDim];
      Value[][] kH, vH;
      foreach (ki; keys[li]) { kH ~= ki[hs..hs+headDim]; }
      foreach (vi; values[li]) { vH ~= vi[hs..hs+headDim]; }
      // Dot product of query against all past keys, scaled to prevent vanishing gradients
      Value[] attnLogits;
      foreach (t; 0..kH.length) {
        Value s = new Value(0.0);
        foreach (j; 0..headDim) {
          s = s + qH[j] * kH[t][j];
        }
        attnLogits ~= s / (headDim ^^ 0.5);
      }
      // Convert logits to probabilities: how much to attend to each past tokenId
      Value[] attnWeights = softmax(attnLogits);
      // Weighted sum of values: the actual attended information
      foreach (j; 0..headDim) {
        Value s = new Value(0.0);
        foreach (t; 0..vH.length) {
          s = s + attnWeights[t] * vH[t][j];
        }
        xAttn ~= s;
      }
    }
    // Project concatenated head outputs back to embedding dim
    x = linear(xAttn, stateDict[format("layer%d.attn_wo", li)]);
    x = zip(x, xResidual).map!(t => t[0] + t[1]).array;
    // 2) MLP block
    xResidual = x;
    x = rmsnorm(x);
    x = linear(x, stateDict[format("layer%d.mlp_fc1", li)]);
    foreach (i; 0..x.length) { x[i] = x[i].relu(); }
    x = linear(x, stateDict[format("layer%d.mlp_fc2", li)]);
    x = zip(x, xResidual).map!(t => t[0] + t[1]).array;
  }
  return linear(x, stateDict["lm_head"]);
}
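// Shape-level smoke test (illustrative; the tiny dimensions are arbitrary):
// a single forward step should return one logit per vocabulary entry and push
// exactly one k,v pair per layer into the cache.
unittest {
  size_t vocab = 5, emb = 4, heads = 2, layers = 1;
  Value[][][string] sd = [
    "wte": matrix(vocab, emb),
    "wpe": matrix(8, emb),
    "lm_head": matrix(vocab, emb),
    "layer0.attn_wq": matrix(emb, emb),
    "layer0.attn_wk": matrix(emb, emb),
    "layer0.attn_wv": matrix(emb, emb),
    "layer0.attn_wo": matrix(emb, emb),
    "layer0.mlp_fc1": matrix(4 * emb, emb),
    "layer0.mlp_fc2": matrix(emb, 4 * emb)
  ];
  auto keys = new Value[][][](layers);
  auto values = new Value[][][](layers);
  Value[] logits = gpt(sd, layers, heads, emb / heads, 0, 0, keys, values);
  assert(logits.length == vocab);
  assert(keys[0].length == 1 && values[0].length == 1);
}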
size_t weightedChoice(double[] weights, ref Random rng) {
  double r = uniform(0.0, weights.sum, rng);
  double cumulative = 0;
  foreach (i, w; weights) {
    cumulative += w;
    if (r <= cumulative) return i;
  }
  return weights.length - 1;
}
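// Illustrative check: with all probability mass on one index, sampling must
// return that index for any draw r > 0 (the seed is fixed, so this is
// deterministic).
unittest {
  auto rng = Random(7);
  foreach (i; 0..25) { assert(weightedChoice([0.0, 1.0, 0.0], rng) == 1); }
}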
int main(string[] args) {
  string path = "./data/input.txt";
  if (!path.exists()) {
    auto content = get("https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt");
    //auto content = get("https://raw.githubusercontent.com/karpathy/makemore/988aa59/names.txt");
    path.write(content);
    writefln("Downloaded %d bytes, saved to %s", content.length, path);
  }
  string content = readText(path);
  string[] docs = content.split("\n");
  docs = docs.randomShuffle();
  writefln("Loaded %d lines", docs.length);
  auto uchars = content.byDchar.array.sort().uniq().array;
  auto BOS = uchars.length;
  auto vocabSize = uchars.length + 1;
  writefln("vocab size: %d tokens", vocabSize);
  size_t nLayer = 1;              // depth of the transformer neural network (number of layers)
  size_t nEmbd = 16;              // width of the network (embedding dimension)
  size_t blockSize = 16;          // maximum context length of the attention window (note: for the alternative names.txt dataset, the longest name is 15 characters)
  size_t nHead = 4;               // number of attention heads
  size_t headDim = nEmbd / nHead; // derived dimension of each head
  size_t AIAYN = 4;               // "Attention Is All You Need" MLP expansion factor (hidden width = 4 * nEmbd)
  Value[][][string] stateDict = [
    "wte": matrix(vocabSize, nEmbd),
    "wpe": matrix(blockSize, nEmbd),
    "lm_head": matrix(vocabSize, nEmbd)
  ];
  foreach (i; 0..nLayer) {
    stateDict[format("layer%d.attn_wq", i)] = matrix(nEmbd, nEmbd);
    stateDict[format("layer%d.attn_wk", i)] = matrix(nEmbd, nEmbd);
    stateDict[format("layer%d.attn_wv", i)] = matrix(nEmbd, nEmbd);
    stateDict[format("layer%d.attn_wo", i)] = matrix(nEmbd, nEmbd);
    stateDict[format("layer%d.mlp_fc1", i)] = matrix(AIAYN * nEmbd, nEmbd);
    stateDict[format("layer%d.mlp_fc2", i)] = matrix(nEmbd, AIAYN * nEmbd);
  }
  // Flatten all weight matrices into a single parameter list for the optimizer
  Value[] params = stateDict.values.joiner.joiner.array;
  writefln("Num parameters: %d", params.length);
  int numSteps = 1000;
  double learningRate = 0.001, beta1 = 0.85, beta2 = 0.99, epsAdam = 1e-8;
  double[] m = new double[](params.length); m[] = 0.0; // Adam first moment
  double[] v = new double[](params.length); v[] = 0.0; // Adam second moment
  foreach (step; 0..numSteps) {
    // Tokenize document
    string doc = docs[step % docs.length];
    ulong[] tokens = [BOS];
    foreach (dchar ch; doc) {
      auto idx = uchars.countUntil(ch);
      if (idx >= 0) tokens ~= cast(ulong)idx;
    }
    tokens ~= BOS;
    int n = cast(int)min(blockSize, tokens.length - 1);
    // Forward pass
    Value[][][] keys = new Value[][][](nLayer);
    Value[][][] values = new Value[][][](nLayer);
    Value[] losses;
    foreach (posId; 0..n) {
      size_t tokenId = tokens[posId];
      size_t targetId = tokens[posId + 1];
      Value[] logits = gpt(stateDict, nLayer, nHead, headDim, tokenId, posId, keys, values);
      Value[] probs = softmax(logits);
      losses ~= probs[targetId].log() * -1.0;
    }
    Value loss = losses.fold!((a, b) => a + b)(new Value(0.0)) / cast(double)n;
    // Backward
    loss.backward();
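    // (Note: this is standard Adam, Kingma & Ba 2015.) m and v are exponential
    // moving averages of the gradient and squared gradient; the 1/(1 - beta^(t+1))
    // factors undo the bias from initializing both at zero, and each weight moves
    // by lrT * mHat / (sqrt(vHat) + eps).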
    // Adam update with linear learning-rate decay
    double lrT = learningRate * (1.0 - cast(double)step / numSteps);
    foreach (i, p; params) {
      m[i] = beta1 * m[i] + (1 - beta1) * p.grad;
      v[i] = beta2 * v[i] + (1 - beta2) * p.grad ^^ 2;
      double mHat = m[i] / (1 - beta1 ^^ (step + 1));
      double vHat = v[i] / (1 - beta2 ^^ (step + 1));
      double update = lrT * mHat / (vHat ^^ 0.5 + epsAdam);
      p.data -= update;
      p.grad = 0;
    }
    writef("\rstep %4d / %4d | loss %.4f", step + 1, numSteps, loss.data);
  }
  double temperature = 0.5;
  writeln("\n--- inference (new, hallucinated samples) ---");
  auto rng = Random(unpredictableSeed);
  foreach (sampleIdx; 0 .. 20) {
    Value[][][] ikeys = new Value[][][](nLayer);
    Value[][][] ivalues = new Value[][][](nLayer);
    size_t tokenId = BOS;
    string sample;
    foreach (posId; 0..blockSize) {
      Value[] logits = gpt(stateDict, nLayer, nHead, headDim, tokenId, posId, ikeys, ivalues);
      Value[] scaled = logits.map!(l => l / temperature).array;
      Value[] probs = softmax(scaled);
      tokenId = weightedChoice(probs.map!(p => p.data).array, rng);
      if (tokenId == BOS) break;
      sample ~= uchars[tokenId];
    }
    writefln("sample %2d: %s", sampleIdx + 1, sample);
  }
  return 0;
}
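The file is self-contained apart from the dataset download (std.net.curl needs libcurl available at runtime). With a D compiler installed, dmd -run microgpt.d (assuming that filename; rdmd or an ldc2 build should behave the same) trains for 1000 steps and then prints 20 sampled lines; compiling with -unittest additionally runs the embedded sanity checks.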