Persistent RCON connections — fixes server crash from connection spam
Root cause: self-play opened/closed a new TCP socket for every RCON command (hundreds/minute). Paper's RCON listener creates a thread per connection, overwhelming the server until it stopped. Fix: PersistentRCON class maintains a single connection per server with auto-reconnect. Thread-safe via lock. Connection pool keyed by host:port. Applied to: - mc_aigod_paper.py (prod paper-ai + dev) - mc_aigod.py (shrink-world) - self_play.py (training data generation) - persistent_rcon.py (shared module) Before: ~100+ RCON connections/minute → server crash After: 3 persistent connections total → stable Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -0,0 +1,147 @@
|
||||
"""
|
||||
Persistent RCON connection with auto-reconnect.
|
||||
|
||||
Replaces the pattern of opening/closing a socket per command.
|
||||
Thread-safe — one instance per server, shared across threads.
|
||||
|
||||
Usage:
|
||||
rcon = PersistentRCON("localhost", 25577, "password")
|
||||
result = rcon.command("list")
|
||||
result = rcon.command("give player minecraft:diamond 1")
|
||||
# Connection stays open, auto-reconnects on failure
|
||||
"""
|
||||
|
||||
import socket
|
||||
import struct
|
||||
import threading
|
||||
import time
|
||||
import logging
|
||||
|
||||
log = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class PersistentRCON:
|
||||
def __init__(self, host: str, port: int, password: str, timeout: float = 10.0):
|
||||
self.host = host
|
||||
self.port = port
|
||||
self.password = password
|
||||
self.timeout = timeout
|
||||
self._sock = None
|
||||
self._lock = threading.Lock()
|
||||
self._req_id = 0
|
||||
self._connected = False
|
||||
|
||||
def _connect(self):
|
||||
"""Establish connection and authenticate."""
|
||||
try:
|
||||
if self._sock:
|
||||
try:
|
||||
self._sock.close()
|
||||
except:
|
||||
pass
|
||||
|
||||
self._sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
|
||||
self._sock.settimeout(self.timeout)
|
||||
self._sock.connect((self.host, self.port))
|
||||
|
||||
# Authenticate
|
||||
self._send_packet(1, 3, self.password)
|
||||
resp = self._recv_packet()
|
||||
if resp is None:
|
||||
raise ConnectionError("Auth failed — no response")
|
||||
|
||||
self._connected = True
|
||||
log.debug(f"RCON connected to {self.host}:{self.port}")
|
||||
except Exception as e:
|
||||
self._connected = False
|
||||
self._sock = None
|
||||
raise ConnectionError(f"RCON connect failed: {e}")
|
||||
|
||||
def _send_packet(self, req_id: int, ptype: int, payload: str):
|
||||
data = struct.pack("<ii", req_id, ptype) + payload.encode("utf-8") + b"\x00\x00"
|
||||
self._sock.sendall(struct.pack("<i", len(data)) + data)
|
||||
|
||||
def _recv_packet(self) -> str:
|
||||
try:
|
||||
raw_len = self._sock.recv(4)
|
||||
if len(raw_len) < 4:
|
||||
return None
|
||||
length = struct.unpack("<i", raw_len)[0]
|
||||
data = b""
|
||||
while len(data) < length:
|
||||
chunk = self._sock.recv(length - len(data))
|
||||
if not chunk:
|
||||
return None
|
||||
data += chunk
|
||||
return data[8:-2].decode("utf-8", errors="replace")
|
||||
except (socket.timeout, ConnectionResetError, BrokenPipeError, OSError):
|
||||
return None
|
||||
|
||||
def command(self, cmd: str) -> str:
|
||||
"""Execute an RCON command. Auto-reconnects on failure."""
|
||||
with self._lock:
|
||||
# Try twice — first attempt, then reconnect and retry
|
||||
for attempt in range(2):
|
||||
try:
|
||||
if not self._connected or self._sock is None:
|
||||
self._connect()
|
||||
|
||||
self._req_id += 1
|
||||
self._send_packet(self._req_id, 2, cmd)
|
||||
result = self._recv_packet()
|
||||
|
||||
if result is None:
|
||||
# Connection dropped
|
||||
self._connected = False
|
||||
if attempt == 0:
|
||||
log.debug(f"RCON connection lost, reconnecting...")
|
||||
continue
|
||||
return ""
|
||||
|
||||
return result
|
||||
|
||||
except (ConnectionError, OSError, BrokenPipeError, socket.timeout) as e:
|
||||
self._connected = False
|
||||
self._sock = None
|
||||
if attempt == 0:
|
||||
log.debug(f"RCON error ({e}), reconnecting...")
|
||||
time.sleep(0.5)
|
||||
continue
|
||||
log.warning(f"RCON failed after retry: {e}")
|
||||
return f"ERROR: {e}"
|
||||
|
||||
return ""
|
||||
|
||||
def close(self):
|
||||
with self._lock:
|
||||
if self._sock:
|
||||
try:
|
||||
self._sock.close()
|
||||
except:
|
||||
pass
|
||||
self._sock = None
|
||||
self._connected = False
|
||||
|
||||
def __del__(self):
|
||||
self.close()
|
||||
|
||||
|
||||
# --- Global connection pool ---
|
||||
_pool = {}
|
||||
_pool_lock = threading.Lock()
|
||||
|
||||
|
||||
def get_rcon(host: str, port: int, password: str) -> PersistentRCON:
|
||||
"""Get or create a persistent RCON connection."""
|
||||
key = f"{host}:{port}"
|
||||
with _pool_lock:
|
||||
if key not in _pool:
|
||||
_pool[key] = PersistentRCON(host, port, password)
|
||||
return _pool[key]
|
||||
|
||||
|
||||
def rcon(cmd: str, host: str, port: int, password: str) -> str:
|
||||
"""Drop-in replacement for the old rcon() function.
|
||||
Uses persistent connections under the hood."""
|
||||
conn = get_rcon(host, port, password)
|
||||
return conn.command(cmd)
|
||||
Reference in New Issue
Block a user