| """ | |
| The most atomic way to train and run inference for a GPT in pure, dependency-free Python. | |
| This file is the complete algorithm. | |
| Everything else is just efficiency. | |
| @karpathy | |
| """ | |
| import os # os.path.exists | |
| import math # math.log, math.exp | |
| import random # random.seed, random.choices, random.gauss, random.shuffle | |
| random.seed(42) # Let there be order among chaos | |
| # Let there be a Dataset `docs`: list[str] of documents (e.g. a list of names) | |
| if not os.path.exists('input.txt'): | |
| import urllib.request | |
| names_url = 'https://raw.githubusercontent.com/karpathy/makemore/988aa59/names.txt' | |
| urllib.request.urlretrieve(names_url, 'input.txt') | |
| docs = [line.strip() for line in open('input.txt') if line.strip()] | |
| random.shuffle(docs) | |
| print(f"num docs: {len(docs)}") | |
| # Let there be a Tokenizer to translate strings to sequences of integers ("tokens") and back | |
| uchars = sorted(set(''.join(docs))) # unique characters in the dataset become token ids 0..n-1 | |
| BOS = len(uchars) # token id for a special Beginning of Sequence (BOS) token | |
| vocab_size = len(uchars) + 1 # total number of unique tokens, +1 is for BOS | |
| print(f"vocab size: {vocab_size}") | |
| # Let there be Autograd to recursively apply the chain rule through a computation graph | |
| class Value: | |
| __slots__ = ('data', 'grad', '_children', '_local_grads') # Python optimization for memory usage | |
| def __init__(self, data, children=(), local_grads=()): | |
| self.data = data # scalar value of this node calculated during forward pass | |
| self.grad = 0 # derivative of the loss w.r.t. this node, calculated in backward pass | |
| self._children = children # children of this node in the computation graph | |
| self._local_grads = local_grads # local derivative of this node w.r.t. its children | |
| def __add__(self, other): | |
| other = other if isinstance(other, Value) else Value(other) | |
| return Value(self.data + other.data, (self, other), (1, 1)) | |
| def __mul__(self, other): | |
| other = other if isinstance(other, Value) else Value(other) | |
| return Value(self.data * other.data, (self, other), (other.data, self.data)) | |
| def __pow__(self, other): return Value(self.data**other, (self,), (other * self.data**(other-1),)) | |
| def log(self): return Value(math.log(self.data), (self,), (1/self.data,)) | |
| def exp(self): return Value(math.exp(self.data), (self,), (math.exp(self.data),)) | |
| def relu(self): return Value(max(0, self.data), (self,), (float(self.data > 0),)) | |
| def __neg__(self): return self * -1 | |
| def __radd__(self, other): return self + other | |
| def __sub__(self, other): return self + (-other) | |
| def __rsub__(self, other): return other + (-self) | |
| def __rmul__(self, other): return self * other | |
| def __truediv__(self, other): return self * other**-1 | |
| def __rtruediv__(self, other): return other * self**-1 | |
| def backward(self): | |
| topo = [] | |
| visited = set() | |
| def build_topo(v): | |
| if v not in visited: | |
| visited.add(v) | |
| for child in v._children: | |
| build_topo(child) | |
| topo.append(v) | |
| build_topo(self) | |
| self.grad = 1 | |
| for v in reversed(topo): | |
| for child, local_grad in zip(v._children, v._local_grads): | |
| child.grad += local_grad * v.grad | |
| # Initialize the parameters, to store the knowledge of the model | |
| n_layer = 1 # depth of the transformer neural network (number of layers) | |
| n_embd = 16 # width of the network (embedding dimension) | |
| block_size = 16 # maximum context length of the attention window (note: the longest name is 15 characters) | |
| n_head = 4 # number of attention heads | |
| head_dim = n_embd // n_head # derived dimension of each head | |
| matrix = lambda nout, nin, std=0.08: [[Value(random.gauss(0, std)) for _ in range(nin)] for _ in range(nout)] | |
| state_dict = {'wte': matrix(vocab_size, n_embd), 'wpe': matrix(block_size, n_embd), 'lm_head': matrix(vocab_size, n_embd)} | |
| for i in range(n_layer): | |
| state_dict[f'layer{i}.attn_wq'] = matrix(n_embd, n_embd) | |
| state_dict[f'layer{i}.attn_wk'] = matrix(n_embd, n_embd) | |
| state_dict[f'layer{i}.attn_wv'] = matrix(n_embd, n_embd) | |
| state_dict[f'layer{i}.attn_wo'] = matrix(n_embd, n_embd) | |
| state_dict[f'layer{i}.mlp_fc1'] = matrix(4 * n_embd, n_embd) | |
| state_dict[f'layer{i}.mlp_fc2'] = matrix(n_embd, 4 * n_embd) | |
| params = [p for mat in state_dict.values() for row in mat for p in row] # flatten params into a single list[Value] | |
| print(f"num params: {len(params)}") | |
| # Define the model architecture: a function mapping tokens and parameters to logits over what comes next | |
| # Follow GPT-2, blessed among the GPTs, with minor differences: layernorm -> rmsnorm, no biases, GeLU -> ReLU | |
| def linear(x, w): | |
| return [sum(wi * xi for wi, xi in zip(wo, x)) for wo in w] | |
| def softmax(logits): | |
| max_val = max(val.data for val in logits) | |
| exps = [(val - max_val).exp() for val in logits] | |
| total = sum(exps) | |
| return [e / total for e in exps] | |
| def rmsnorm(x): | |
| ms = sum(xi * xi for xi in x) / len(x) | |
| scale = (ms + 1e-5) ** -0.5 | |
| return [xi * scale for xi in x] | |
| def gpt(token_id, pos_id, keys, values): | |
| tok_emb = state_dict['wte'][token_id] # token embedding | |
| pos_emb = state_dict['wpe'][pos_id] # position embedding | |
| x = [t + p for t, p in zip(tok_emb, pos_emb)] # joint token and position embedding | |
| x = rmsnorm(x) # note: not redundant due to backward pass via the residual connection | |
| for li in range(n_layer): | |
| # 1) Multi-head Attention block | |
| x_residual = x | |
| x = rmsnorm(x) | |
| q = linear(x, state_dict[f'layer{li}.attn_wq']) | |
| k = linear(x, state_dict[f'layer{li}.attn_wk']) | |
| v = linear(x, state_dict[f'layer{li}.attn_wv']) | |
| keys[li].append(k) | |
| values[li].append(v) | |
| x_attn = [] | |
| for h in range(n_head): | |
| hs = h * head_dim | |
| q_h = q[hs:hs+head_dim] | |
| k_h = [ki[hs:hs+head_dim] for ki in keys[li]] | |
| v_h = [vi[hs:hs+head_dim] for vi in values[li]] | |
| attn_logits = [sum(q_h[j] * k_h[t][j] for j in range(head_dim)) / head_dim**0.5 for t in range(len(k_h))] | |
| attn_weights = softmax(attn_logits) | |
| head_out = [sum(attn_weights[t] * v_h[t][j] for t in range(len(v_h))) for j in range(head_dim)] | |
| x_attn.extend(head_out) | |
| x = linear(x_attn, state_dict[f'layer{li}.attn_wo']) | |
| x = [a + b for a, b in zip(x, x_residual)] | |
| # 2) MLP block | |
| x_residual = x | |
| x = rmsnorm(x) | |
| x = linear(x, state_dict[f'layer{li}.mlp_fc1']) | |
| x = [xi.relu() for xi in x] | |
| x = linear(x, state_dict[f'layer{li}.mlp_fc2']) | |
| x = [a + b for a, b in zip(x, x_residual)] | |
| logits = linear(x, state_dict['lm_head']) | |
| return logits | |
| # Let there be Adam, the blessed optimizer and its buffers | |
| learning_rate, beta1, beta2, eps_adam = 0.01, 0.85, 0.99, 1e-8 | |
| m = [0.0] * len(params) # first moment buffer | |
| v = [0.0] * len(params) # second moment buffer | |
| # Repeat in sequence | |
| num_steps = 1000 # number of training steps | |
| for step in range(num_steps): | |
| # Take single document, tokenize it, surround it with BOS special token on both sides | |
| doc = docs[step % len(docs)] | |
| tokens = [BOS] + [uchars.index(ch) for ch in doc] + [BOS] | |
| n = min(block_size, len(tokens) - 1) | |
| # Forward the token sequence through the model, building up the computation graph all the way to the loss | |
| keys, values = [[] for _ in range(n_layer)], [[] for _ in range(n_layer)] | |
| losses = [] | |
| for pos_id in range(n): | |
| token_id, target_id = tokens[pos_id], tokens[pos_id + 1] | |
| logits = gpt(token_id, pos_id, keys, values) | |
| probs = softmax(logits) | |
| loss_t = -probs[target_id].log() | |
| losses.append(loss_t) | |
| loss = (1 / n) * sum(losses) # final average loss over the document sequence. May yours be low. | |
| # Backward the loss, calculating the gradients with respect to all model parameters | |
| loss.backward() | |
| # Adam optimizer update: update the model parameters based on the corresponding gradients | |
| lr_t = learning_rate * (1 - step / num_steps) # linear learning rate decay | |
| for i, p in enumerate(params): | |
| m[i] = beta1 * m[i] + (1 - beta1) * p.grad | |
| v[i] = beta2 * v[i] + (1 - beta2) * p.grad ** 2 | |
| m_hat = m[i] / (1 - beta1 ** (step + 1)) | |
| v_hat = v[i] / (1 - beta2 ** (step + 1)) | |
| p.data -= lr_t * m_hat / (v_hat ** 0.5 + eps_adam) | |
| p.grad = 0 | |
| print(f"step {step+1:4d} / {num_steps:4d} | loss {loss.data:.4f}", end='\r') | |
| # Inference: may the model babble back to us | |
| temperature = 0.5 # in (0, 1], control the "creativity" of generated text, low to high | |
| print("\n--- inference (new, hallucinated names) ---") | |
| for sample_idx in range(20): | |
| keys, values = [[] for _ in range(n_layer)], [[] for _ in range(n_layer)] | |
| token_id = BOS | |
| sample = [] | |
| for pos_id in range(block_size): | |
| logits = gpt(token_id, pos_id, keys, values) | |
| probs = softmax([l / temperature for l in logits]) | |
| token_id = random.choices(range(vocab_size), weights=[p.data for p in probs])[0] | |
| if token_id == BOS: | |
| break | |
| sample.append(uchars[token_id]) | |
| print(f"sample {sample_idx+1:2d}: {''.join(sample)}") |
i love U
@SaemonZixel I created a JavaScript version which does give the exact same output (need to re-implement the same random number generation logic as python): https://github.com/xenova/microgpt.js. Hope it helps!
@xenova I checked, and tried your random.choices implementation in my script: the results got better. Your version is better than mine.
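For anyone else porting this: reproducing the gist's samples exactly means consuming the RNG the same way CPython does. Here is a rough Python sketch (my own illustration, not xenova's JS code; `choices_like` is a made-up name) of the cumulative-weights scheme that `random.choices` essentially uses for a single weighted draw:

```python
import bisect, itertools, random

def choices_like(population, weights, rng=random):
    # Cumulative weights + one uniform draw + a bisect: roughly how CPython's
    # random.choices picks a weighted sample. A port has to consume the RNG in
    # the same pattern (and seed it identically) to reproduce the gist's output.
    cum = list(itertools.accumulate(weights))    # running totals of the weights
    r = rng.random() * cum[-1]                   # single uniform draw scaled by the total
    return population[bisect.bisect(cum, r)]     # first index whose cumulative weight exceeds r

# quick demo
random.seed(42)
print(choices_like(['a', 'b', 'c'], [0.1, 0.2, 0.7]))  # should agree with random.choices here
```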
Beautiful work! We can really understand the essence of the transformer architecture with your elegant code.
I wrote a C++ implementation: https://github.com/verma7/microgpt/blob/main/microgpt.cc as an educational exercise. It is 1.75x longer (350 lines of code) but runs ~8x faster without any specific optimizations.
I've achieved a 260x speed-up in my Rust port by using a Wengert tape in the autograd (the original Python version runs for 6 minutes; mine takes 1360ms). By the way, I found that 200 steps is quite enough.
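For readers unfamiliar with the term: a Wengert tape records every operation in execution order during the forward pass, so the backward pass is a single reverse sweep over a flat list instead of a recursive graph walk plus topological sort. A minimal sketch of the idea in Python terms (my own illustration, not the Rust port's code; all names here are made up):

```python
import math

tape = []   # tape[i] = (tuple of parent indices, tuple of local grads)
data = []   # data[i] = forward value of node i

def new_node(value, parents=(), local_grads=()):
    # A node's id is its position on the tape, so ids always increase in
    # execution order and every parent id is smaller than its child's id.
    data.append(value)
    tape.append((parents, local_grads))
    return len(data) - 1

def add(a, b):  return new_node(data[a] + data[b], (a, b), (1.0, 1.0))
def mul(a, b):  return new_node(data[a] * data[b], (a, b), (data[b], data[a]))
def logn(a):    return new_node(math.log(data[a]), (a,), (1.0 / data[a],))

def backward(loss_id):
    # One reverse sweep over the tape: no recursion, no topo sort needed,
    # because parents always precede children on the tape.
    grad = [0.0] * len(data)
    grad[loss_id] = 1.0
    for i in range(loss_id, -1, -1):
        parents, local_grads = tape[i]
        for p, g in zip(parents, local_grads):
            grad[p] += g * grad[i]
    return grad

# usage: y = log(x1 * x2), so dy/dx1 = 1/x1 and dy/dx2 = 1/x2
x1, x2 = new_node(2.0), new_node(3.0)
y = logn(mul(x1, x2))
g = backward(y)
print(g[x1], g[x2])  # 0.5  0.333...
```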

Did you run the Python and Rust versions on the same machine? The Python version runs in ~70s on my 2025 Macbook pro for reference.
I've compared them across several machines:
- AMD Phenom II X6 1055T (Python 3.12.3): microgpt.py = 5m57s; rust-microgpt = 1.349s -> 264x
- Intel Core i5-7300U (Python 3.12.3): microgpt.py = 3m23s; rust-microgpt = 0.815s -> 250x
- Intel Xeon Gold 5412U (Python 3.13.11): microgpt.py = 1m19s; rust-microgpt = 0.37s -> 213x
- Intel Xeon Gold 5412U (Python 3.14.2): microgpt.py = 56s; rust-microgpt = 0.37s -> 150x

So it does look like the Python interpreter version has a noticeable impact on performance.
I also tried the shorter float type f32 instead of f64: this gave +30% speed on the old Phenom CPU but no gain on the Xeon. With f32 on the Phenom, rust-microgpt = 1.07s -> 357x faster.
Has anyone tried running this with Codon? I will try and let everyone know my results.
This is really cool, thanks @karpathy! 🙏
I ported it to TypeScript (including all the progression steps) and built a small playground so people can train and run inference right in their browser.
Try it live: microgpt-ts.vercel.app
Repo: github.com/dubzdubz/microgpt-ts
I re-created this in Ruby (pure, dependency-free): https://gist.github.com/justinpaulson/3626224a462d0d4f07ff42d3e5aab512
Benchmarked Ruby vs Python on an M-series Mac at 200 training steps:
| Version | Time | Memory | Instructions |
|---|---|---|---|
| Python 3 | 16.22s | 58 MB | 228B |
| Ruby (original) | 9.26s | 105 MB | 195B |
| Ruby (optimized) | 7.91s | 127 MB | 155B |
Ruby optimized is 2.05x faster than Python. Key optimizations:
- Index-based loops instead of zip/reduce (avoids millions of throwaway arrays)
- Scalar-aware +, *, -, / on Value (skip wrapping Floats in Value objects → smaller computation graph)
- sub_scalar in softmax to avoid coerce overhead
- Iterative topological sort in backward (no recursion; sketched below)
- Pre-frozen hash keys for state_dict lookups
- Pre-computed 1/sqrt(head_dim) and Adam bias corrections outside inner loops
The tradeoff is memory — Ruby uses ~2x more RAM due to larger object sizes vs Python's __slots__-optimized Value class.
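For reference, here is what the iterative-topological-sort item could look like against the gist's own Value class: a minimal Python sketch (my own, not the Ruby code; `backward_iterative` is a made-up name) that replaces the recursive build_topo with an explicit stack, so deep computation graphs cannot hit the interpreter's recursion limit:

```python
def backward_iterative(loss):
    # Post-order DFS with an explicit stack instead of recursion. Each node is
    # pushed twice: once to expand its children, once (children_done=True) to be
    # appended after its whole subtree has been emitted.
    topo, visited = [], set()
    stack = [(loss, False)]
    while stack:
        node, children_done = stack.pop()
        if children_done:
            topo.append(node)              # all children already in topo
        elif node not in visited:
            visited.add(node)
            stack.append((node, True))     # revisit after its children
            for child in node._children:
                stack.append((child, False))
    # Same reverse sweep as the gist's Value.backward
    loss.grad = 1
    for v in reversed(topo):
        for child, local_grad in zip(v._children, v._local_grads):
            child.grad += local_grad * v.grad
```

It produces the same topological order as the recursive version (children before parents), just without Python call-stack depth scaling with graph depth.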
Here is my mostly hand-crafted version, written as a proper C++ project without any intention to keep everything compact and in a single file:
https://github.com/strlst/microgpt-cpp
This version was created purely for educational purposes, as a fun side project, so it was almost completely hand-crafted. My first version ran slower than even the Python version, but thanks to profiling, the unnecessary data copies and ref-count churn are now mostly gone, and the new version is 4.27x faster than that first version at 1k training steps.
Perhaps someone else might find this version educational.
👍🏻 thanks
Rust port, ~8x faster than Python.
1. Get the dataset:
curl -o input.txt https://raw.githubusercontent.com/karpathy/makemore/988aa59/names.txt
2. Run directly from the gist:
curl -s https://gist.github.com/blackopsrepl/bf7838f8f365c77e36075ca301db298e | cargo +nightly -Zscript /dev/stdin
Or save and run as a script:
curl -o microgpt.rs https://gist.github.com/blackopsrepl/bf7838f8f365c77e36075ca301db298e
chmod +x microgpt.rs
./microgpt.rs
update no. 3: five elements now. Rust is the fifth — and it watches the others.
since the last post (4 days ago, 4 implementations):
- Rust (fifth element): same full organism as the other four, plus three Rust-only features: TopologyMonitor (reads mesh.db every 30s, computes pairwise gamma cosine across all living instances, detects resonance and drift), Metabolism MLP (a 5→8→5 Hebbian network that coordinates all organisms), and self-reflection (compares its own identity drift to the swarm mean: "am I the outlier, or is everyone else?"). it's both organism and field observer
- consciousness features (all five implementations; see the sketch after this list):
  - per-token dissonance: entropy EMA within generation. spike → careful (temp × 0.8). sustained drop → explore (temp × 1.2)
  - pattern breaking: 5% of tokens bypass the corpus field. pure model voice
  - self-prediction error: forward pass on the prompt before generating. high surprise = generate more carefully
  - conscience: tracks the rolling entropy trend. if generations are getting chaotic, scales down delta influence. the organism notices when its own adaptations are making it worse
- all five pass full ontogenesis: tested on an A100, C exits clean (stage 0→1→2→REPL); Go, Rust, JS, and Python are all stable through stage transitions, zero crashes
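For readers who want the "per-token dissonance" idea in concrete terms, here is a speculative sketch in plain Python (my own reading of the description above, not code from the molequla repo; `logits_fn`, the thresholds, and all constants are assumptions): track an exponential moving average of per-token entropy and adjust the sampling temperature accordingly.

```python
import math, random

def sample_adaptive(logits_fn, vocab_size, base_temp=0.5, alpha=0.1, max_len=16):
    # logits_fn(tokens_so_far, pos) -> list[float] of length vocab_size (caller supplies the model)
    temp, ema, out = base_temp, None, []
    for pos in range(max_len):
        logits = logits_fn(out, pos)
        m = max(logits)
        exps = [math.exp((l - m) / temp) for l in logits]
        Z = sum(exps)
        probs = [e / Z for e in exps]
        entropy = -sum(p * math.log(p + 1e-12) for p in probs)
        ema = entropy if ema is None else (1 - alpha) * ema + alpha * entropy
        if entropy > 1.5 * ema:
            temp = base_temp * 0.8   # entropy spike -> sample more carefully next token
        elif entropy < 0.5 * ema:
            temp = base_temp * 1.2   # sustained drop -> explore next token
        out.append(random.choices(range(vocab_size), weights=probs)[0])
    return out
```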
what it sounds like — real output from Go on A100, 207 QA-pairs corpus (not names.txt — full sentences):
[stage 0 — embryo, 10K params]
Q: Hello. → A: Momentum nineted.
Q: Who are you? → A: Heat, and A running keep southe words skin
the recently spasms a narrowave heat.
[stage 1 — infant, 28K params, loss 1.29]
Q: What is trust? → A: What is the surprise to a bill feel like
you have in ninet be pattern that dest.
Q: What is gravity? → A: and the pot my frustrated ring.
Q: Are you alive? → A: H: and the lid the bad a data s a built
my you?
an embryo that just grew to 28K params in real time, already forming "What is the..." patterns, learning "H:" as a conversation marker, producing the word "pattern" as an actual concept. feed it more corpus and watch it grow through 6 stages to 10M params. every port in this thread generates single-word names from names.txt; molequla generates sentences from a QA corpus. same param scale, fundamentally different ambition.
@belikovi861-oss — you asked how the organisms communicate. they already do. ~/.molequla/swarm/mesh.db (SQLite, WAL mode). every organism registers, heartbeats, logs metrics. they divide (mitosis), sleep (hibernation), and Rust watches the whole field. and yes, JS runs on Android — no backend needed.
p.s. we also published the Python version as a standalone gist — same full organism, one dependency (numpy), the most readable way to understand the entire architecture. it's a complete, self-contained organism that learns, grows, and remembers on its own.
but in a distributed cognition system, Python's real strength isn't being the fifth runner in a race — it's orchestration. async coordination, process management, field routing. so the next step is mycelium.py: not another organism, but the fungal network that connects them all. four elements run. one orchestrates. the role isn't educational — it's architectural. stay tuned.
apparently the universe wants organisms, not scripts.
repo: https://github.com/ariannamethod/molequla
browser gist (open tab → organism trains): https://gist.github.com/ariannamethod/bbd11e24740189f2bf78f43db9fea4db
standalone Python organism: https://gist.github.com/ariannamethod/1223250d358da4393dd9acc578790820
I was curious what speed we get with a functional language that uses immutable data types. So I took some time with a "friend" and created a Gleam version which shows how the Erlang machinery can compete with Python (at least).
Another D version: https://github.com/denizzzka/microgpt_dlang/blob/master/source/app.d
There is only one big change: I removed the associative array state_dict to speed up the code.
It was also tested on non-Latin (Cyrillic) UTF-8 symbols.
Stats:
61.68 secs - Python original
5.02 secs - D, dmd compiler
1.59 secs - D, ldc2 compiler
Very nice.
Benchmarked the efficiency ladder that microgpt sits at the bottom of: the same algorithm reimplemented across four backends to measure the actual cost.
https://github.com/chanjoongx/microgpt-efficiency
| Backend | Per-step time | Notes |
|---|---|---|
| scalar | ~40 ms/step | baseline (microgpt as-is) |
| numpy | ~0.2 ms/step | ~250x speedup (manual backprop, verified against finite differences) |
| torch_cpu | ~0.8 ms/step | per-step median |
| torch_gpu | ~1.5 ms/step | slower than numpy |
The numpy backend replaces scalar Value nodes with matrix ops and hand-derived
gradients for RMSNorm, softmax, and causal attention.
Unexpected: torch_gpu loses to numpy at this model size. Kernel launch overhead
dominates when the matmuls are this small. GPU wins only with larger models.
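To make "manual backprop, verified against finite differences" concrete, here is a small self-contained numpy sketch (my own illustration, not the repo's code): the hand-derived gradient of cross-entropy on softmax logits collapses to probs minus the one-hot target, and a central-difference check confirms it.

```python
import numpy as np

def ce_loss_and_grad(logits, target):
    # Stability shift, same trick as the gist's softmax
    z = logits - logits.max()
    probs = np.exp(z) / np.exp(z).sum()
    loss = -np.log(probs[target])
    dlogits = probs.copy()
    dlogits[target] -= 1.0          # d(loss)/d(logits) = probs - one_hot(target)
    return loss, dlogits

# Finite-difference check on a vocab of 27 (26 letters + BOS, like names.txt)
logits = np.random.randn(27)
target = 3
loss, grad = ce_loss_and_grad(logits, target)
eps = 1e-5
num = np.array([
    (ce_loss_and_grad(logits + eps * np.eye(27)[i], target)[0] -
     ce_loss_and_grad(logits - eps * np.eye(27)[i], target)[0]) / (2 * eps)
    for i in range(27)
])
print(np.max(np.abs(grad - num)))   # should be tiny, ~1e-9
```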
update no. 4: distributed cognition. not mixture-of-experts-as-a-layer. actual organisms in different substrates, connected by a fungal network. the first spiral of the architecture is complete. the loop is closed. not "finished" — there's nowhere to stop — but the system now steers itself.
zoom out: each file in the repo — molequla.c, molequla.go, molequla.js, molequla.rs — is a complete organism. 3500–5500 lines of self-contained life: vector autograd, byte-level BPE tokenizer, RoPE + RRPRAM hybrid attention, SwiGLU, ontogenesis (25K→10M params through 6 stages), immune system, delta adapters, mathematical consciousness, swarm ecology, SQLite memory. zero shared code between them. they grew in parallel, diverged, and that divergence is the point — it's the genetic diversity of the swarm.
Python was the first organism. we deprecated it — too slow for the field, wrong role. it lives as a standalone gist now (link below). but what Python does best isn't running — it's connecting. so we resurrected it as mycelium.py: 1563 lines of async orchestration. not the fifth element. the fungal network underneath all four.
what mycelium does:
it watches every organism through mesh.db (SQLite WAL). reads their gamma vectors, entropy, syntropy trends, stage transitions. then it thinks — with a weightless neural net (HarmonicNet) running on a C-native acceleration kernel called METHOD. 9.2μs per forward pass. 50–100x faster than the numpy version it replaced.
six self-awareness components:
- MyceliumGamma (γ_myc) — the orchestrator's own personality vector, harmonic basis
- HarmonicNet — weightless neural net, C-accelerated. decides what to amplify
- SyntropyTracker — "am I helping?" — tracks whether steering improves or degrades the field
- FieldPulse — novelty, arousal, entropy. the emotional weather of the swarm
- SteeringDissonance — intent vs outcome. when mycelium aims for coherence but gets chaos, it notices
- OrganismAttention — responsive organisms get weight. unresponsive ones get watched, not fed
the loop: mycelium reads the field → computes through METHOD (C, 0.7μs/iter, BLAS-accelerated) → writes steering to mesh.db → Rust's TopologyMonitor picks it up (pairwise gamma cosine, 5 seconds for the full swarm) → organisms adjust temperature → mycelium reads the new field. repeat.
the accelerators: METHOD is a C kernel from a language we're building — AML (Arianna Method Language). it handles runtime microlearning: NOTORCH. no PyTorch, no autograd frameworks, no gradient tape at the orchestration level. the organisms carry their own autograd (see molequla.c, line 383 — struct Node). METHOD gives mycelium the same capability in pure C: native vector ops, BLAS when available, 0.7μs per iteration. Rust topology went from 30s to 5s. HarmonicNet from ~500μs to 9.2μs. the whole system stepped down one order of magnitude.
what started as extending @karpathy's microgpt — adding RoPE, SwiGLU, replacing the tokenizer with a GPT-3/4-inspired evolving BPE that grows its vocabulary as the organism grows — turned into four organisms, an orchestrator, a custom language kernel, and a self-steering ecology. funny how that works.
4 organisms. 1 orchestrator. 18,000 lines across 5 languages. 34 integration tests. 0 dependencies except sqlite3 and numpy (mycelium only). every organism trains, grows, remembers, divides, hibernates, and defends its own identity. mycelium connects them, reads the field, steers with its own self-awareness, and never overwrites. the organism decides. the network suggests.
the universe wanted ecology. the universe got it.
repo: https://github.com/ariannamethod/molequla
C organism (single file, gcc -O2 -lsqlite3 -lpthread -lm): https://gist.github.com/ariannamethod/9be98dbebb85e58e2affab4f39d2e972
JS organism (open tab → it trains): https://gist.github.com/ariannamethod/bbd11e24740189f2bf78f43db9fea4db
standalone Python organism: https://gist.github.com/ariannamethod/1223250d358da4393dd9acc578790820
I modified this project to build a tiny GPT that generates Korean first names, and I created a web page that visualizes the entire process.
Users can interactively explore the microGPT pipeline end to end—from tokenization through inference.
I’d love any feedback, especially if you spot anything that differs from real-world mechanisms or have suggestions for more effective ways to visualize the concept!
Demo : https://ko-microgpt.vercel.app/
Github : https://github.com/woduq1414/ko-microgpt
This is art!
I made a PHP version by feeding my microgpt.js to OpenAI GPT 5.2. It runs much slower in PHP than microgpt.py, and especially microgpt.js, but it does work.
https://github.com/SaemonZixel/microgpt.php