@justinpaulson
Last active February 20, 2026 21:29
MicroGPT in Ruby — a complete GPT training and inference implementation in pure, dependency-free Ruby. Re-creation of @karpathy's microgpt.py
# frozen_string_literal: true
# MicroGPT in Ruby — optimized version
# Original Python by @karpathy, Ruby port + optimizations
require 'open-uri'
srand(42)
unless File.exist?('input.txt')
  names_url = 'https://raw.githubusercontent.com/karpathy/makemore/988aa59/names.txt'
  File.write('input.txt', URI.open(names_url).read)
end
docs = File.readlines('input.txt').map(&:strip).reject(&:empty?)
docs.shuffle!
puts "num docs: #{docs.length}"
uchars = docs.join.chars.uniq.sort
BOS = uchars.length
VOCAB_SIZE = uchars.length + 1
puts "vocab size: #{VOCAB_SIZE}"
# Optimized Value class:
# - Specialized add_scalar/mul_scalar/sub_scalar to avoid wrapping constants in Value objects
# - Use Hash instead of Set for visited tracking in backward
# - Avoid zip in backward (use index-based iteration)
class Value
  attr_accessor :data, :grad
  attr_reader :_children, :_local_grads

  def initialize(data, children = nil, local_grads = nil)
    @data = data.to_f
    @grad = 0.0
    @_children = children
    @_local_grads = local_grads
  end

  def +(other)
    if other.is_a?(Value)
      Value.new(@data + other.data, [self, other], [1.0, 1.0])
    else
      Value.new(@data + other, [self], [1.0])
    end
  end

  def *(other)
    if other.is_a?(Value)
      Value.new(@data * other.data, [self, other], [other.data, @data])
    else
      Value.new(@data * other, [self], [other.to_f])
    end
  end

  def **(other)
    Value.new(@data**other, [self], [other * @data**(other - 1)])
  end

  def log
    Value.new(Math.log(@data), [self], [1.0 / @data])
  end

  def exp
    e = Math.exp(@data)
    Value.new(e, [self], [e])
  end

  def relu
    Value.new(@data > 0.0 ? @data : 0.0, [self], [@data > 0.0 ? 1.0 : 0.0])
  end

  # Subtract a scalar (Float/Integer) without wrapping it in Value
  def sub_scalar(s)
    Value.new(@data - s, [self], [1.0])
  end

  def -@
    Value.new(-@data, [self], [-1.0])
  end

  def -(other)
    if other.is_a?(Value)
      Value.new(@data - other.data, [self, other], [1.0, -1.0])
    else
      Value.new(@data - other, [self], [1.0])
    end
  end

  def /(other)
    if other.is_a?(Value)
      self * (other**-1)
    else
      # Division by scalar: just multiply by reciprocal
      Value.new(@data / other, [self], [1.0 / other])
    end
  end

  def coerce(other)
    [Value.new(other), self]
  end

  def backward
    topo = []
    visited = {}
    stack = [self]
    until stack.empty?
      v = stack.last
      vid = v.object_id
      unless visited.key?(vid)
        visited[vid] = false # mark as seen but not finished
        children = v._children
        if children
          children.each do |child|
            stack.push(child) unless visited.key?(child.object_id)
          end
        end
        # If we're still on top after pushing children, we can process
        if stack.last.equal?(v)
          stack.pop
          visited[vid] = true
          topo << v
        end
      else
        stack.pop
        unless visited[vid]
          visited[vid] = true
          topo << v
        end
      end
    end

    @grad = 1.0
    i = topo.size - 1
    while i >= 0
      v = topo[i]
      children = v._children
      if children
        local_grads = v._local_grads
        vgrad = v.grad
        j = 0
        while j < children.size
          children[j].grad += local_grads[j] * vgrad
          j += 1
        end
      end
      i -= 1
    end
  end
end
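# Model hyperparameters: a single transformer block with 4 attention heads,
# 16-dimensional embeddings, and a 16-token context window.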
N_LAYER = 1
N_EMBD = 16
BLOCK_SIZE = 16
N_HEAD = 4
HEAD_DIM = N_EMBD / N_HEAD
INV_SQRT_HEAD_DIM = 1.0 / Math.sqrt(HEAD_DIM)
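# Box-Muller transform: sample from a normal distribution with the given mean and std.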
def gauss(mean, std)
  u1 = rand
  u2 = rand
  z = Math.sqrt(-2.0 * Math.log(u1)) * Math.cos(2.0 * Math::PI * u2)
  mean + std * z
end
def make_matrix(nout, nin, std = 0.08)
  Array.new(nout) { Array.new(nin) { Value.new(gauss(0, std)) } }
end
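# Learnable weights: token embeddings (wte), position embeddings (wpe), and the
# LM head, plus per-layer attention and MLP matrices added below.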
STATE_DICT = {
  'wte' => make_matrix(VOCAB_SIZE, N_EMBD),
  'wpe' => make_matrix(BLOCK_SIZE, N_EMBD),
  'lm_head' => make_matrix(VOCAB_SIZE, N_EMBD)
}
# Pre-build frozen key strings to avoid repeated string interpolation
LAYER_KEYS = N_LAYER.times.map do |i|
  {
    wq: "layer#{i}.attn_wq".freeze,
    wk: "layer#{i}.attn_wk".freeze,
    wv: "layer#{i}.attn_wv".freeze,
    wo: "layer#{i}.attn_wo".freeze,
    fc1: "layer#{i}.mlp_fc1".freeze,
    fc2: "layer#{i}.mlp_fc2".freeze,
  }
end
N_LAYER.times do |i|
  lk = LAYER_KEYS[i]
  STATE_DICT[lk[:wq]] = make_matrix(N_EMBD, N_EMBD)
  STATE_DICT[lk[:wk]] = make_matrix(N_EMBD, N_EMBD)
  STATE_DICT[lk[:wv]] = make_matrix(N_EMBD, N_EMBD)
  STATE_DICT[lk[:wo]] = make_matrix(N_EMBD, N_EMBD)
  STATE_DICT[lk[:fc1]] = make_matrix(4 * N_EMBD, N_EMBD)
  STATE_DICT[lk[:fc2]] = make_matrix(N_EMBD, 4 * N_EMBD)
end
PARAMS = STATE_DICT.values.flat_map { |mat| mat.flat_map { |row| row } }
puts "num params: #{PARAMS.length}"
# Optimized linear: index-based loop, manual accumulator (no zip, no reduce)
def linear(x, w)
  xlen = x.size
  w.map do |wo|
    acc = wo[0] * x[0]
    i = 1
    while i < xlen
      acc = acc + wo[i] * x[i]
      i += 1
    end
    acc
  end
end
# Optimized softmax: use sub_scalar to avoid Value wrapping of max_val
def softmax(logits)
  max_val = -Float::INFINITY
  logits.each { |v| max_val = v.data if v.data > max_val }
  exps = logits.map { |val| val.sub_scalar(max_val).exp }
  total = exps[0]
  i = 1
  while i < exps.size
    total = total + exps[i]
    i += 1
  end
  exps.map { |e| e / total }
end
# Optimized rmsnorm: avoid wrapping length in Value; use scalar division
def rmsnorm(x)
  n = x.length
  acc = x[0] * x[0]
  i = 1
  while i < n
    acc = acc + x[i] * x[i]
    i += 1
  end
  ms = acc / n
  scale = (ms + 1e-5)**-0.5
  x.map { |xi| xi * scale }
end
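# Forward pass for one token at position pos_id. keys/values form a per-layer
# KV cache, so earlier positions are attended to without being recomputed.
# Returns the logits over the vocabulary for the next token.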
def gpt(token_id, pos_id, keys, values)
  tok_emb = STATE_DICT['wte'][token_id]
  pos_emb = STATE_DICT['wpe'][pos_id]
  # Index-based addition instead of zip
  x = Array.new(N_EMBD) { |i| tok_emb[i] + pos_emb[i] }
  x = rmsnorm(x)
  N_LAYER.times do |li|
    lk = LAYER_KEYS[li]
    x_residual = x
    x = rmsnorm(x)
    q = linear(x, STATE_DICT[lk[:wq]])
    k = linear(x, STATE_DICT[lk[:wk]])
    v = linear(x, STATE_DICT[lk[:wv]])
    keys[li] << k
    values[li] << v
    x_attn = Array.new(N_EMBD)
    N_HEAD.times do |h|
      hs = h * HEAD_DIM
      # Compute attention logits with pre-computed 1/sqrt(head_dim)
      n_kv = keys[li].size
      attn_logits = Array.new(n_kv)
      t = 0
      while t < n_kv
        kt = keys[li][t]
        acc = q[hs] * kt[hs]
        j = 1
        while j < HEAD_DIM
          acc = acc + q[hs + j] * kt[hs + j]
          j += 1
        end
        attn_logits[t] = acc * INV_SQRT_HEAD_DIM
        t += 1
      end
      attn_weights = softmax(attn_logits)
      # Compute head output
      j = 0
      while j < HEAD_DIM
        acc = attn_weights[0] * values[li][0][hs + j]
        t = 1
        while t < n_kv
          acc = acc + attn_weights[t] * values[li][t][hs + j]
          t += 1
        end
        x_attn[hs + j] = acc
        j += 1
      end
    end
    x = linear(x_attn, STATE_DICT[lk[:wo]])
    # Index-based residual add
    i = 0
    while i < N_EMBD
      x[i] = x[i] + x_residual[i]
      i += 1
    end
    # MLP block
    x_residual = x
    x = rmsnorm(x)
    x = linear(x, STATE_DICT[lk[:fc1]])
    x.map!(&:relu)
    x = linear(x, STATE_DICT[lk[:fc2]])
    i = 0
    while i < N_EMBD
      x[i] = x[i] + x_residual[i]
      i += 1
    end
  end
  linear(x, STATE_DICT['lm_head'])
end
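# Sample an index with probability proportional to its weight.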
def weighted_choice(weights)
  total = weights.sum
  r = rand * total
  cumulative = 0.0
  weights.each_with_index do |w, i|
    cumulative += w
    return i if r <= cumulative
  end
  weights.length - 1
end
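# Training loop: one document per step, next-character prediction with
# cross-entropy loss, Adam updates with a linearly decaying learning rate.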
# Adam optimizer
learning_rate = 0.01
beta1 = 0.85
beta2 = 0.99
eps_adam = 1e-8
m = Array.new(PARAMS.length, 0.0)
v = Array.new(PARAMS.length, 0.0)
num_params = PARAMS.length
num_steps = (ENV['NUM_STEPS'] || 1000).to_i
num_steps.times do |step|
  doc = docs[step % docs.length]
  tokens = [BOS] + doc.chars.map { |ch| uchars.index(ch) } + [BOS]
  n = [BLOCK_SIZE, tokens.length - 1].min
  keys = Array.new(N_LAYER) { [] }
  vals = Array.new(N_LAYER) { [] }
  losses = []
  n.times do |pos_id|
    token_id = tokens[pos_id]
    target_id = tokens[pos_id + 1]
    logits = gpt(token_id, pos_id, keys, vals)
    probs = softmax(logits)
    loss_t = -probs[target_id].log
    losses << loss_t
  end
  loss_sum = losses[0]
  li = 1
  while li < losses.size
    loss_sum = loss_sum + losses[li]
    li += 1
  end
  loss = loss_sum / n
  loss.backward
  lr_t = learning_rate * (1.0 - step.to_f / num_steps)
  one_minus_beta1 = 1.0 - beta1
  one_minus_beta2 = 1.0 - beta2
  bias_correction1 = 1.0 / (1.0 - beta1**(step + 1))
  bias_correction2 = 1.0 / (1.0 - beta2**(step + 1))
  i = 0
  while i < num_params
    p = PARAMS[i]
    g = p.grad
    m[i] = beta1 * m[i] + one_minus_beta1 * g
    v[i] = beta2 * v[i] + one_minus_beta2 * g * g
    m_hat = m[i] * bias_correction1
    v_hat = v[i] * bias_correction2
    p.data -= lr_t * m_hat / (Math.sqrt(v_hat) + eps_adam)
    p.grad = 0.0
    i += 1
  end
  print "\rstep #{(step + 1).to_s.rjust(4)} / #{num_steps.to_s.rjust(4)} | loss #{format('%.4f', loss.data)}"
end
# Inference
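# Autoregressive sampling: feed each sampled token back in, reusing the KV cache,
# until BOS is produced or BLOCK_SIZE characters have been generated.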
temperature = 0.5
inv_temp = 1.0 / temperature
puts "\n--- inference (new, hallucinated names) ---"
20.times do |sample_idx|
  keys = Array.new(N_LAYER) { [] }
  vals = Array.new(N_LAYER) { [] }
  token_id = BOS
  sample = []
  BLOCK_SIZE.times do |pos_id|
    logits = gpt(token_id, pos_id, keys, vals)
    probs = softmax(logits.map { |l| l * inv_temp })
    token_id = weighted_choice(probs.map(&:data))
    break if token_id == BOS
    sample << uchars[token_id]
  end
  puts "sample #{(sample_idx + 1).to_s.rjust(2)}: #{sample.join}"
end
justinpaulson commented Feb 20, 2026

Benchmarked both Ruby versions against the original Python on an M-series Mac at 200 training steps:

Version            Time     Memory   Instructions
Python 3           16.22s   58 MB    228B
Ruby (original)    9.26s    105 MB   195B
Ruby (optimized)   7.91s    127 MB   155B

The optimized Ruby version is 2.05x faster than Python. Key optimizations:

  • Index-based loops instead of zip/reduce (avoids millions of throwaway arrays; see the sketch below)
  • Scalar-aware +, *, -, / on Value (skip wrapping Floats in Value objects → smaller computation graph)
  • sub_scalar in softmax to avoid coerce overhead
  • Iterative topological sort in backward (no recursion)
  • Pre-frozen hash keys for state_dict lookups
  • Pre-computed 1/sqrt(head_dim) and Adam bias corrections outside inner loops

The tradeoff is memory: Ruby uses roughly 2x more RAM, largely because its Value objects are bigger than Python's __slots__-optimized Value class.
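
For a concrete sense of the first bullet, here is a minimal sketch of the zip/reduce-to-index-loop change, shown on plain Floats (the variable names are illustrative, not taken from the gist; in the real code the same pattern runs over Value objects, so every temporary array and intermediate sum also costs graph nodes):

w = [0.1, 0.2, 0.3]
x = [1.0, 2.0, 3.0]

# Before: zip builds a pairs array and map builds another array on every call
slow = w.zip(x).map { |wi, xi| wi * xi }.reduce(:+)

# After: a single accumulator and an index loop, no intermediate arrays
fast = w[0] * x[0]
i = 1
while i < w.size
  fast += w[i] * x[i]
  i += 1
end

puts slow  # => 1.4
puts fast  # => 1.4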

Original Python version by @karpathy: https://gist.github.com/karpathy/8627fe009c40f57531cb18360106ce95
