LM with a REPL
| """ | |
| An LM with a REPL | |
| Gives an LLM a Python REPL: the model can write ```repl``` code blocks, | |
| which get executed, with stdout/stderr fed back into the conversation. | |
| Requires a running mlx_lm.server: | |
| mlx_lm.server | |
| """ | |
| import re | |
| import sys | |
| import traceback | |
| from io import StringIO | |
| from openai import OpenAI | |
| SYSTEM_PROMPT = """\ | |
| You have access to a Python REPL. To run code, wrap it in a ```repl``` block: | |
| ```repl | |
| x = 2 + 2 | |
| print(x) | |
| ``` | |
| You will see the output (stdout and stderr) from each code block. Variables \ | |
| persist between blocks. | |
| Use the REPL to compute your answer rather than guessing.\ | |
| """ | |
| def extract_repl_block(text): | |
| """Extract code from the first ```repl``` fenced block, or None.""" | |
| m = re.search(r"```repl\n(.*?)```", text, re.DOTALL) | |
| return m.group(1) if m else None | |
| def execute_code(code, namespace): | |
| """ | |
| Execute code in the given namespace. | |
| Returns (stdout, stderr). | |
| """ | |
| old_stdout, old_stderr = sys.stdout, sys.stderr | |
| sys.stdout, sys.stderr = StringIO(), StringIO() | |
| try: | |
| exec(code, namespace) | |
| except Exception: | |
| traceback.print_exc(file=sys.stderr) | |
| finally: | |
| stdout = sys.stdout.getvalue() | |
| stderr = sys.stderr.getvalue() | |
| sys.stdout, sys.stderr = old_stdout, old_stderr | |
| return stdout, stderr | |
| def rlm(prompt, base_url="http://localhost:8080/v1", max_iterations=10): | |
| """ | |
| Run the RLM loop. | |
| Args: | |
| prompt: The user's question or task. | |
| base_url: The mlx_lm.server endpoint. | |
| max_iterations: Max REPL turns before giving up. | |
| Returns: | |
| The last assistant response. | |
| """ | |
| client = OpenAI(base_url=base_url, api_key="not-needed") | |
| messages = [ | |
| {"role": "system", "content": SYSTEM_PROMPT}, | |
| {"role": "user", "content": prompt}, | |
| ] | |
| namespace = {"__builtins__": __builtins__} | |
| for i in range(max_iterations): | |
| response = client.chat.completions.create( | |
| messages=messages, | |
| max_tokens=2048, | |
| temperature=0.0, | |
| model="mlx-community/Qwen3-Coder-Next-6bit" | |
| ) | |
| assistant_text = response.choices[0].message.content | |
| messages.append({"role": "assistant", "content": assistant_text}) | |
| print(f"\n--- Iteration {i + 1} ---") | |
| print(assistant_text) | |
| block = extract_repl_block(assistant_text) | |
| if not block: | |
| return assistant_text | |
| stdout, stderr = execute_code(block, namespace) | |
| result = "" | |
| if stdout: | |
| result += stdout | |
| if stderr: | |
| result += f"[stderr]\n{stderr}" | |
| if not result: | |
| result = "(no output)" | |
| messages.append({"role": "user", "content": result}) | |
| print(f"\nREPL output:\n{result}") | |
| return assistant_text | |
| if __name__ == "__main__": | |
| import argparse | |
| parser = argparse.ArgumentParser(description="LLM with a Python REPL") | |
| parser.add_argument("prompt", help="The prompt to send to the model") | |
| parser.add_argument( | |
| "--base-url", | |
| default="http://localhost:8080/v1", | |
| help="mlx_lm.server base URL (default: http://localhost:8080/v1)", | |
| ) | |
| parser.add_argument( | |
| "--max-iterations", | |
| type=int, | |
| default=10, | |
| help="Max REPL iterations (default: 10)", | |
| ) | |
| args = parser.parse_args() | |
| result = rlm(args.prompt, base_url=args.base_url, max_iterations=args.max_iterations) |
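For a quick smoke test, the loop can also be driven as a library. A minimal sketch: the filename `rlm.py` and the example prompt are my assumptions, and the `--model` flag mirrors the model name hard-coded in the script above.

```python
# Assumes this gist is saved as rlm.py and that an mlx_lm.server instance is
# already running, e.g.:
#   mlx_lm.server --model mlx-community/Qwen3-Coder-Next-6bit
from rlm import rlm

answer = rlm("What is the sum of the first 100 primes?", max_iterations=5)
print(answer)
```

The same loop is available from the command line: `python rlm.py "your prompt" --base-url http://localhost:8080/v1 --max-iterations 10`.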
I have been hacking on this concept for the past few days: https://github.com/kulesh/recurgent. Besides security, there are a bunch of interesting problems to solve around consistency and reliable reproduction of code, etc. Happy to hear your feedback.
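On the security point: the script above `exec()`s model-written code in-process, so a buggy or hostile block can do anything the Python process can, including hanging the loop. One common mitigation is to run each block in a fresh subprocess with a timeout. A minimal, hypothetical sketch (not part of the gist, and it gives up the persistent namespace between blocks):

```python
import subprocess
import sys


def execute_code_sandboxed(code, timeout=10):
    """Hypothetical alternative to execute_code: run the block in a subprocess.

    Trades variable persistence for isolation: an infinite loop, sys.exit(),
    or a hard crash in model-written code no longer takes down the driver.
    """
    try:
        proc = subprocess.run(
            [sys.executable, "-c", code],
            capture_output=True,
            text=True,
            timeout=timeout,
        )
        return proc.stdout, proc.stderr
    except subprocess.TimeoutExpired:
        return "", f"(timed out after {timeout}s)"
```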