Andrej Karpathy's microgpt.py translated into the D programming language
/*
The most atomic way to train and run inference for a GPT in pure, dependency-free D.
This file is the complete algorithm.
Everything else is just efficiency.
@karpathy
Translation by @DannyArends
*/
import std.algorithm : countUntil, fold, joiner, map, maxElement, min, sort, sum, uniq;
import std.array : array;
import std.file : exists, mkdirRecurse, readText, write;
import std.format : format;
import std.math : log, sqrt, cos, PI;
import std.net.curl : get;
import std.random : Random, uniform, randomShuffle, unpredictableSeed;
import std.range : zip;
import std.stdio : writef, writefln, writeln;
import std.string : split;
import std.utf : byDchar;
class Value {
  double data;
  double grad = 0;
  Value[] children;
  double[] localGrads;
  // Init
  this(double data, Value[] children = [], double[] localGrads = []) {
    this.data = data;
    this.children = children;
    this.localGrads = localGrads;
  }
  // Unary minus
  Value opUnary(string op)() if (op == "-") { return this * -1.0; }
  // Binary ops with a double
  Value opBinary(string op)(double rhs) {
    static if (op == "^^")
      return new Value(data ^^ rhs, [this], [rhs * data ^^ (rhs - 1)]);
    else
      return opBinary!op(new Value(rhs));
  }
  // Binary ops with another Value
  Value opBinary(string op)(Value rhs) {
    static if (op == "+") return new Value(data + rhs.data, [this, rhs], [1.0, 1.0]);
    else static if (op == "*") return new Value(data * rhs.data, [this, rhs], [rhs.data, data]);
    else static if (op == "-") return this + (-rhs);
    else static if (op == "/") return this * rhs ^^ -1.0;
    else static assert(0, "Unsupported op: " ~ op);
  }
  // opBinaryRight ops with a double
  Value opBinaryRight(string op)(double lhs) {
    static if (op == "+") return this + lhs;
    else static if (op == "*") return this * lhs;
    else static if (op == "-") return new Value(lhs) - this;
    else static if (op == "/") return new Value(lhs) / this;
    else static assert(0, "Unsupported op: " ~ op);
  }
  // Non-operator methods
  Value log() { import std.math : log; return new Value(log(data), [this], [1.0 / data]); }
  Value exp() { import std.math : exp; return new Value(exp(data), [this], [exp(data)]); }
  Value relu() { return new Value(data > 0 ? data : 0, [this], [data > 0 ? 1.0 : 0.0]); }
  // Backpropagation
  void backward() {
    Value[] topo;
    bool[Value] visited;
    void buildTopo(Value v) {
      if (v in visited) return;
      visited[v] = true;
      foreach (child; v.children) { buildTopo(child); }
      topo ~= v;
    }
    buildTopo(this);
    this.grad = 1.0;
    foreach_reverse (v; topo) {
      foreach (i, child; v.children) { child.grad += v.localGrads[i] * v.grad; }
    }
  }
}
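// A minimal sanity check of the autograd engine (illustrative values, run with
// `dmd -unittest`): for d = a*b + c the gradients are dd/da = b, dd/db = a, dd/dc = 1.
unittest {
  auto a = new Value(2.0);
  auto b = new Value(3.0);
  auto c = new Value(4.0);
  auto d = a * b + c;
  d.backward();
  assert(d.data == 10.0);
  assert(a.grad == 3.0 && b.grad == 2.0 && c.grad == 1.0);
}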
// Sample from N(mu, std^2) via the Box-Muller transform
double gaussRandom(ref Random rng, double mu, double std) {
  double u1 = uniform!("(]")(0.0, 1.0, rng); // (0,1] so log(u1) stays finite
  double u2 = uniform!("[]")(0.0, 1.0, rng);
  double z = sqrt(-2.0 * log(u1)) * cos(2.0 * PI * u2);
  return mu + std * z;
}
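// Rough statistical check (fixed seed, deliberately loose tolerance): the sample
// mean of many Box-Muller draws should land near mu.
unittest {
  auto rng = Random(42);
  double total = 0.0;
  foreach (i; 0..10_000) { total += gaussRandom(rng, 5.0, 2.0); }
  double mean = total / 10_000.0;
  assert(mean > 4.8 && mean < 5.2);
}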
// A (nout x nin) weight matrix with Gaussian-initialized entries
Value[][] matrix(size_t nout, size_t nin, double std = 0.08) {
  auto rng = Random(unpredictableSeed);
  Value[][] result = new Value[][](nout);
  foreach (i; 0..nout) {
    result[i] = new Value[](nin);
    foreach (j; 0..nin) { result[i][j] = new Value(gaussRandom(rng, 0.0, std)); }
  }
  return result;
}
// Matrix-vector product: one dot product of x per output row of w
Value[] linear(Value[] x, Value[][] w) {
  return w.map!(wo => zip(wo, x).map!(t => t[0] * t[1]).fold!((a, b) => a + b)(new Value(0.0))).array;
}
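// Worked example: a 2x2 weight matrix times a length-2 input vector.
unittest {
  Value[][] w = [[new Value(1.0), new Value(2.0)],
                 [new Value(3.0), new Value(4.0)]];
  Value[] x = [new Value(5.0), new Value(6.0)];
  Value[] y = linear(x, w);
  assert(y[0].data == 17.0); // 1*5 + 2*6
  assert(y[1].data == 39.0); // 3*5 + 4*6
}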
// Softmax with the max logit subtracted for numerical stability
Value[] softmax(Value[] logits) {
  double maxVal = logits.map!(v => v.data).maxElement;
  Value[] exps = logits.map!(v => (v - maxVal).exp()).array;
  Value total = exps.fold!((a, b) => a + b)(new Value(0.0));
  return exps.map!(e => e / total).array;
}
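// Check: softmax outputs form a probability distribution and preserve the
// ordering of the logits.
unittest {
  import std.math : isClose;
  Value[] probs = softmax([new Value(1.0), new Value(2.0), new Value(3.0)]);
  assert(isClose(probs.map!(p => p.data).sum, 1.0));
  assert(probs[0].data < probs[1].data && probs[1].data < probs[2].data);
}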
// Root-mean-square normalization: rescale x to (approximately) unit mean square
Value[] rmsnorm(Value[] x) {
  Value ms = x.map!(xi => xi * xi).fold!((a, b) => a + b)(new Value(0.0)) / cast(double)x.length;
  Value scale = (ms + 1e-5) ^^ -0.5;
  return x.map!(xi => xi * scale).array;
}
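// Check: after rmsnorm the mean square of the activations is ~1 (up to the 1e-5
// epsilon in the denominator).
unittest {
  import std.math : isClose;
  Value[] y = rmsnorm([new Value(3.0), new Value(4.0)]); // mean square of input = 12.5
  double ms = y.map!(v => v.data * v.data).sum / y.length;
  assert(isClose(ms, 1.0, 1e-3));
}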
Value[] gpt(Value[][][string] stateDict, size_t nLayer, size_t nHead, size_t headDim, size_t tokenId, size_t posId, Value[][][] keys, Value[][][] values) {
  Value[] tokEmb = stateDict["wte"][tokenId];
  Value[] posEmb = stateDict["wpe"][posId];
  Value[] x = rmsnorm(zip(tokEmb, posEmb).map!(t => t[0] + t[1]).array);
  foreach (li; 0..nLayer) {
    // 1) Multi-head Attention block
    Value[] xResidual = x;
    x = rmsnorm(x);
    // Project input into query, key, value spaces
    Value[] q = linear(x, stateDict[format("layer%d.attn_wq", li)]);
    Value[] k = linear(x, stateDict[format("layer%d.attn_wk", li)]);
    Value[] v = linear(x, stateDict[format("layer%d.attn_wv", li)]);
    // Append k,v to cache (enables attending to all previous tokens)
    keys[li] ~= k;
    values[li] ~= v;
    Value[] xAttn;
    foreach (h; 0..nHead) {
      size_t hs = h * headDim;
      // Slice out this head's portion of q, k, v
      Value[] qH = q[hs..hs+headDim];
      Value[][] kH, vH;
      foreach (ki; keys[li]) { kH ~= ki[hs..hs+headDim]; }
      foreach (vi; values[li]) { vH ~= vi[hs..hs+headDim]; }
      // Dot product of query against all past keys, scaled by 1/sqrt(headDim) so the softmax does not saturate
      Value[] attnLogits;
      foreach (t; 0..kH.length) {
        Value s = new Value(0.0);
        foreach (j; 0..headDim) {
          s = s + qH[j] * kH[t][j];
        }
        attnLogits ~= s / (headDim ^^ 0.5);
      }
      // Convert logits to probabilities: how much to attend to each past tokenId
      Value[] attnWeights = softmax(attnLogits);
      // Weighted sum of values: the actual attended information
      foreach (j; 0..headDim) {
        Value s = new Value(0.0);
        foreach (t; 0..vH.length) {
          s = s + attnWeights[t] * vH[t][j];
        }
        xAttn ~= s;
      }
    }
    // Project concatenated head outputs back to embedding dim
    x = linear(xAttn, stateDict[format("layer%d.attn_wo", li)]);
    x = zip(x, xResidual).map!(t => t[0] + t[1]).array;
    // 2) MLP block
    xResidual = x;
    x = rmsnorm(x);
    x = linear(x, stateDict[format("layer%d.mlp_fc1", li)]);
    foreach (i; 0..x.length) { x[i] = x[i].relu(); }
    x = linear(x, stateDict[format("layer%d.mlp_fc2", li)]);
    x = zip(x, xResidual).map!(t => t[0] + t[1]).array;
  }
  return linear(x, stateDict["lm_head"]);
}
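// Shape-level smoke test with toy sizes (a vocabulary of 3, one layer, two heads;
// these numbers are illustrative, not the training configuration): the model
// returns one logit per vocabulary entry and grows the KV cache by one position.
unittest {
  size_t vocab = 3, embd = 4, heads = 2, layers = 1;
  Value[][][string] sd = [
    "wte": matrix(vocab, embd), "wpe": matrix(2, embd), "lm_head": matrix(vocab, embd),
    "layer0.attn_wq": matrix(embd, embd), "layer0.attn_wk": matrix(embd, embd),
    "layer0.attn_wv": matrix(embd, embd), "layer0.attn_wo": matrix(embd, embd),
    "layer0.mlp_fc1": matrix(4 * embd, embd), "layer0.mlp_fc2": matrix(embd, 4 * embd)
  ];
  auto ks = new Value[][][](layers);
  auto vs = new Value[][][](layers);
  Value[] logits = gpt(sd, layers, heads, embd / heads, 0, 0, ks, vs);
  assert(logits.length == vocab);
  assert(ks[0].length == 1 && vs[0].length == 1);
}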
// Sample an index i with probability weights[i] / sum(weights)
size_t weightedChoice(double[] weights, ref Random rng) {
  double r = uniform(0.0, weights.sum, rng);
  double cumulative = 0;
  foreach (i, w; weights) {
    cumulative += w;
    if (r <= cumulative) return i;
  }
  return weights.length - 1;
}
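// Check: a one-hot weight vector must always select that index.
unittest {
  auto rng = Random(1);
  double[] w = [0.0, 0.0, 1.0, 0.0];
  foreach (i; 0..10) { assert(weightedChoice(w, rng) == 2); }
}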
int main(string[] args) {
  string path = "./data/input.txt";
  if (!path.exists()) {
    mkdirRecurse("./data"); // make sure the data directory exists before writing
    auto content = get("https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt");
    //auto content = get("https://raw.githubusercontent.com/karpathy/makemore/988aa59/names.txt");
    path.write(content);
    writefln("Downloaded %d bytes, saved to %s", content.length, path);
  }
  string content = readText(path);
  string[] docs = content.split("\n");
  docs = docs.randomShuffle();
  writefln("Loaded %d lines", docs.length);
  auto uchars = content.byDchar.array.sort().uniq().array;
  auto BOS = uchars.length; // begin/end-of-sequence token: one id past the last character
  auto vocabSize = uchars.length + 1;
  writefln("vocab size: %d tokens", vocabSize);
  size_t nLayer = 1; // depth of the transformer neural network (number of layers)
  size_t nEmbd = 16; // width of the network (embedding dimension)
  size_t blockSize = 16; // maximum context length of the attention window (note: in names.txt the longest name is 15 characters)
  size_t nHead = 4; // number of attention heads
  size_t headDim = nEmbd / nHead; // derived dimension of each head
  size_t AIAYN = 4; // Attention Is All You Need - MLP expansion factor
  Value[][][string] stateDict = [
    "wte": matrix(vocabSize, nEmbd),
    "wpe": matrix(blockSize, nEmbd),
    "lm_head": matrix(vocabSize, nEmbd)
  ];
  foreach (i; 0..nLayer) {
    stateDict[format("layer%d.attn_wq", i)] = matrix(nEmbd, nEmbd);
    stateDict[format("layer%d.attn_wk", i)] = matrix(nEmbd, nEmbd);
    stateDict[format("layer%d.attn_wv", i)] = matrix(nEmbd, nEmbd);
    stateDict[format("layer%d.attn_wo", i)] = matrix(nEmbd, nEmbd);
    stateDict[format("layer%d.mlp_fc1", i)] = matrix(AIAYN * nEmbd, nEmbd);
    stateDict[format("layer%d.mlp_fc2", i)] = matrix(nEmbd, AIAYN * nEmbd);
  }
  // Flatten all weights into a single parameter list for the optimizer
  Value[] params = stateDict.values.joiner.joiner.array;
  writefln("Num parameters: %d", params.length);
  int numSteps = 1000;
  double learningRate = 0.001, beta1 = 0.85, beta2 = 0.99, epsAdam = 1e-8;
  double[] m = new double[](params.length); m[] = 0.0; // Adam first moment
  double[] v = new double[](params.length); v[] = 0.0; // Adam second moment
  foreach (step; 0..numSteps) {
    // Tokenize document: BOS, character ids, BOS
    string doc = docs[step % docs.length];
    ulong[] tokens = [BOS];
    foreach (dchar ch; doc) {
      auto idx = uchars.countUntil(ch);
      if (idx >= 0) tokens ~= cast(ulong)idx;
    }
    tokens ~= BOS;
    int n = cast(int)min(blockSize, tokens.length - 1);
    // Forward pass
    Value[][][] keys = new Value[][][](nLayer);
    Value[][][] values = new Value[][][](nLayer);
    Value[] losses;
    foreach (posId; 0..n) {
      size_t tokenId = tokens[posId];
      size_t targetId = tokens[posId + 1];
      Value[] logits = gpt(stateDict, nLayer, nHead, headDim, tokenId, posId, keys, values);
      Value[] probs = softmax(logits);
      losses ~= probs[targetId].log() * -1.0; // negative log-likelihood of the target
    }
    Value loss = losses.fold!((a, b) => a + b)(new Value(0.0)) / cast(double)n;
    // Backward
    loss.backward();
    // Adam update with linearly decaying learning rate and bias correction
    double lrT = learningRate * (1.0 - cast(double)step / numSteps);
    foreach (i, p; params) {
      m[i] = beta1 * m[i] + (1 - beta1) * p.grad;
      v[i] = beta2 * v[i] + (1 - beta2) * p.grad ^^ 2;
      double mHat = m[i] / (1 - beta1 ^^ (step + 1));
      double vHat = v[i] / (1 - beta2 ^^ (step + 1));
      double update = lrT * mHat / (vHat ^^ 0.5 + epsAdam);
      p.data -= update;
      p.grad = 0;
    }
    writef("\rstep %4d / %4d | loss %.4f", step + 1, numSteps, loss.data);
  }
  double temperature = 0.5;
  writeln("\n--- inference (new, hallucinated samples) ---");
  auto rng = Random(unpredictableSeed);
  foreach (sampleIdx; 0..20) {
    Value[][][] ikeys = new Value[][][](nLayer);
    Value[][][] ivalues = new Value[][][](nLayer);
    size_t tokenId = BOS;
    string sample;
    foreach (posId; 0..blockSize) {
      Value[] logits = gpt(stateDict, nLayer, nHead, headDim, tokenId, posId, ikeys, ivalues);
      Value[] scaled = logits.map!(l => l / temperature).array;
      Value[] probs = softmax(scaled);
      tokenId = weightedChoice(probs.map!(p => p.data).array, rng);
      if (tokenId == BOS) break; // the model ended the sequence
      sample ~= uchars[tokenId];
    }
    writefln("sample %2d: %s", sampleIdx + 1, sample);
  }
  return 0;
}