{
  "@context": {
    "@vocab": "http://schema.org/",
    "dcterms": "http://purl.org/dc/terms/",
    "ex": "https://example.org/vocab#"
  },
  "@type": "ScholarlyArticle",
  "headline": "Evolution Strategies at the Hyperscale",
  "author": [
    {
      "@type": "Person",
      "givenName": "Bidipta",
      "familyName": "Sarkar",
      "affiliation": [
        {"@id": "ex:FLAIR"},
        {"@id": "ex:WhiRL"}
      ],
      "email": "bidipta.sarkar@eng.ox.ac.uk"
    },
    {
      "@type": "Person",
      "givenName": "Mattie",
      "familyName": "Fellows",
      "affiliation": {"@id": "ex:FLAIR"},
      "email": "matthew.fellows@eng.ox.ac.uk"
    },
    {
      "@type": "Person",
      "givenName": "Juan Agustin",
      "familyName": "Duque",
      "affiliation": [
        {"@id": "ex:WhiRL"},
        {"@id": "ex:MILA"}
      ],
      "email": "juan.duque@mila.quebec"
    },
    {
      "@type": "Person",
      "givenName": "Shimon",
      "familyName": "Whiteson",
      "affiliation": [
        {"@id": "ex:WhiRL"},
        {"@id": "ex:FLAIR"}
      ],
      "email": "shimon.whiteson@cs.ox.ac.uk"
    },
    {
      "@type": "Organization",
      "@id": "ex:FLAIR",
      "name": "FLAIR - University of Oxford"
    },
    {
      "@type": "Organization",
      "@id": "ex:WhiRL",
      "name": "WhiRL - University of Oxford"
    },
    {
      "@type": "Organization",
      "@id": "ex:MILA",
      "name": "MILA – Québec AI Institute"
    },
    {
      "@type": "Organization",
      "@id": "ex:NVIDIA",
      "name": "NVIDIA AI Technology Center"
    },
    {
      "@type": "Organization",
      "@id": "ex:CIFAR",
      "name": "CIFAR AI Chair"
    },
    {
      "@type": "Organization",
      "@id": "ex:NormaCore",
      "name": "NormaCore.dev"
    }
  ],
  "datePublished": "2025",
  "abstract": "Evolution Strategies (ES) are powerful black-box optimisation methods that scale poorly on GPUs due to low arithmetic intensity. We introduce EGGROLL, a low-rank ES algorithm that improves training speed by 100x for billion-parameter models, achieving up to 91% of pure batch inference throughput. We provide theoretical analysis proving convergence and linearisation in high dimensions. Experiments show EGGROLL enables stable pretraining of integer RNN language models, competitive LLM fine-tuning on reasoning tasks, and efficient tabula rasa RL training without performance loss.",
  "mainEntityOfPage": "https://eshyperscale.github.io/",
  "hasPart": [
    {
      "@type": "DefinedTermSet",
      "name": "Defined Terms in Evolution Strategies",
      "description": "Key terms used in the paper including Evolution Strategies, Gaussian ES, Low-rank Matrix Approximation, Population Distribution, Fitness, Genotype, Score Function, Matrix Gaussian Distribution, Frobenius Norm, and Sub-Gaussian Variables.",
      "hasDefinedTerm": [
        {
          "@type": "DefinedTerm",
          "name": "Evolution Strategies (ES)",
          "description": "A class of black-box optimisation methods that do not require gradients and can handle noisy or non-differentiable objectives."
        },
        {
          "@type": "DefinedTerm",
          "name": "Gaussian ES",
          "description": "An ES variant using Gaussian population distributions with mean and isotropic variance."
        },
        {
          "@type": "DefinedTerm",
          "name": "Low-rank Matrix Approximation",
          "description": "A technique to reduce memory and compute by representing perturbations as products of low-rank matrices."
        },
        {
          "@type": "DefinedTerm",
          "name": "Population Distribution",
          "description": "A parametric distribution over parameters from which mutations are sampled."
        },
        {
          "@type": "DefinedTerm",
          "name": "Fitness",
          "description": "The objective function value to be maximised in ES."
        },
        {
          "@type": "DefinedTerm",
          "name": "Genotype",
          "description": "A vector of parameters representing an individual in the population."
        },
        {
          "@type": "DefinedTerm",
          "name": "Score Function",
          "description": "The gradient of the log-probability of the population distribution with respect to its parameters."
        },
        {
          "@type": "DefinedTerm",
          "name": "Matrix Gaussian Distribution",
          "description": "A generalisation of multivariate Gaussian to matrices with row and column covariance."
        },
        {
          "@type": "DefinedTerm",
          "name": "Frobenius Norm",
          "description": "A matrix norm equivalent to the Euclidean norm of the vectorised matrix."
        },
        {
          "@type": "DefinedTerm",
          "name": "Sub-Gaussian Variables",
          "description": "Random variables with tails that decay at least as fast as Gaussian tails."
        }
      ]
    },
    {
      "@type": "HowTo",
      "name": "How to implement EGGROLL",
      "description": "Steps to implement the EGGROLL low-rank evolution strategy algorithm.",
      "step": [
        {
          "@type": "HowToStep",
          "position": 1,
          "name": "Sample low-rank perturbations",
          "text": "For each worker, sample matrices A and B with dimensions m×r and n×r respectively, with elements drawn i.i.d. from a zero-mean symmetric distribution."
        },
        {
          "@type": "HowToStep",
          "position": 2,
          "name": "Form rank-r perturbation",
          "text": "Compute the perturbation matrix E as E = (1/√r) * A * B^T."
        },
        {
          "@type": "HowToStep",
          "position": 3,
          "name": "Evaluate fitness",
          "text": "Evaluate the fitness function f at the perturbed parameter matrix M + σE."
        },
        {
          "@type": "HowToStep",
          "position": 4,
          "name": "Share fitness values",
          "text": "Workers share scalar fitness values with each other."
        },
        {
          "@type": "HowToStep",
          "position": 5,
          "name": "Reconstruct perturbations",
          "text": "Each worker reconstructs all perturbations from known random seeds."
        },
        {
          "@type": "HowToStep",
          "position": 6,
          "name": "Update parameters",
          "text": "Update the mean parameter matrix M by adding the weighted average of perturbations scaled by their fitness values."
        }
      ]
    },
    {
      "@type": "Question",
      "name": "What is the main advantage of EGGROLL over naive ES?",
      "acceptedAnswer": {
        "@type": "Answer",
        "text": "EGGROLL improves arithmetic intensity by structuring perturbations as low-rank matrices, enabling a hundredfold increase in training speed for billion-parameter models at large population sizes, while maintaining comparable performance."
      }
    },
    {
      "@type": "Question",
      "name": "How does EGGROLL handle noise generation efficiently?",
      "acceptedAnswer": {
        "@type": "Answer",
        "text": "EGGROLL uses a counter-based deterministic random number generator to reconstruct noise on demand, avoiding the need to store large perturbation matrices in memory."
      }
    },
    {
      "@type": "Question",
      "name": "What theoretical guarantees does the paper provide for EGGROLL?",
      "acceptedAnswer": {
        "@type": "Answer",
        "text": "The paper proves that EGGROLL updates converge to the full-rank Gaussian ES updates at a fast rate O(r^{-1}) as rank r increases, and that both EGGROLL and Gaussian ES converge to a linearised form consistent with the neural tangent kernel regime as parameter dimension grows."
      }
    },
    {
      "@type": "Question",
      "name": "Can EGGROLL be used for integer-only training?",
      "acceptedAnswer": {
        "@type": "Answer",
        "text": "Yes, EGGROLL enables stable pretraining of nonlinear recurrent language models that operate purely in integer datatypes, leveraging the large population sizes enabled by the method."
      }
    },
    {
      "@type": "Question",
      "name": "How does EGGROLL perform on reinforcement learning tasks?",
      "acceptedAnswer": {
        "@type": "Answer",
        "text": "EGGROLL is competitive with naive ES in tabula rasa and multi-agent RL settings, often delivering substantial wall-clock improvements due to its batched low-rank structure."
      }
    },
    {
      "@type": "Question",
      "name": "What are the main experimental domains evaluated?",
      "acceptedAnswer": {
        "@type": "Answer",
        "text": "The paper evaluates EGGROLL on pure integer language model pretraining, reinforcement learning tasks including multi-agent environments, and fine-tuning of large language models on reasoning tasks."
      }
    },
    {
      "@type": "Question",
      "name": "What is the arithmetic intensity of EGGROLL compared to naive ES?",
      "acceptedAnswer": {
        "@type": "Answer",
        "text": "EGGROLL achieves much higher arithmetic intensity by batching low-rank perturbations, enabling it to saturate GPU compute with many unique perturbations per input, unlike naive ES which has low arithmetic intensity and limited scalability."
      }
    },
    {
      "@type": "Question",
      "name": "How does EGGROLL handle distributed training efficiently?",
      "acceptedAnswer": {
        "@type": "Answer",
        "text": "EGGROLL uses a lightweight distributed framework with a Coordinator-Worker topology, base-3 fitness packing to reduce communication bandwidth, sharding-aware updates, layer-wise memory management, and direct GPU-to-GPU weight synchronization."
      }
    },
    {
      "@type": "Question",
      "name": "What is the role of low-rank perturbations in EGGROLL?",
      "acceptedAnswer": {
        "@type": "Answer",
        "text": "Low-rank perturbations reduce memory and compute costs by representing perturbations as products of smaller matrices, enabling efficient batched matrix multiplications with higher arithmetic intensity."
      }
    },
    {
      "@type": "HowTo",
      "name": "How to pretrain an integer RNN language model with EGGROLL",
      "description": "Steps to pretrain a nonlinear recurrent neural network language model purely in integer datatypes using EGGROLL.",
      "step": [
        {
          "@type": "HowToStep",
          "position": 1,
          "name": "Design EGG architecture",
          "text": "Use a nonlinear recurrent neural network with all weights and activations in int8 datatype, no explicit activation functions, relying on implicit nonlinearity from int8 clipping."
        },
        {
          "@type": "HowToStep",
          "position": 2,
          "name": "Initialize parameters",
          "text": "Initialize weights as 16 times standard normal samples rounded to int8."
        },
        {
          "@type": "HowToStep",
          "position": 3,
          "name": "Apply EGGROLL perturbations",
          "text": "Sample rank-1 perturbation vectors A and B in int8, apply perturbations efficiently using batched int8 vector-matrix multiplications with int32 accumulation."
        },
        {
          "@type": "HowToStep",
          "position": 4,
          "name": "Calculate fitness",
          "text": "Calculate log-likelihood loss using integer lookup tables for EXP2 and LOG2."
        },
        {
          "@type": "HowToStep",
          "position": 5,
          "name": "Update parameters",
          "text": "Update parameters by discrete unit steps based on fitness-weighted perturbations with a threshold controlling update sparsity."
        }
      ]
    },
    {
      "@type": "HowTo",
      "name": "How to fine-tune large transformer LLMs with EGGROLL",
      "description": "Steps and infrastructure details for fine-tuning large transformer language models using EGGROLL.",
      "step": [
        {
          "@type": "HowToStep",
          "position": 1,
          "name": "Use vLLM inference engine",
          "text": "Leverage vLLM's high-throughput kernel implementations and native multi-LoRA serving with tensor parallelism."
        },
        {
          "@type": "HowToStep",
          "position": 2,
          "name": "Implement custom WorkerExtension",
          "text": "Convert vLLM inference engine into training-capable runtime with sharding-aware updates."
        },
        {
          "@type": "HowToStep",
          "position": 3,
          "name": "Perform layer-wise memory management",
          "text": "Apply ES weight updates in a streaming fashion to reduce memory overhead."
        },
        {
          "@type": "HowToStep",
          "position": 4,
          "name": "Use direct GPU-to-GPU synchronization",
          "text": "Broadcast updated parameters using NCCL to avoid CPU bottlenecks."
        },
        {
          "@type": "HowToStep",
          "position": 5,
          "name": "Apply meta-device blueprinting",
          "text": "Instantiate meta-model to derive weight shapes and sharding without allocating full tensors."
        }
      ]
    },
    {
      "@type": "HowTo",
      "name": "How to fine-tune a time series foundation model for high-frequency trading with EGGROLL",
      "description": "Steps to fine-tune a pretrained S5 model on limit order book data for order execution using EGGROLL.",
      "step": [
        {
          "@type": "HowToStep",
          "position": 1,
          "name": "Pretrain S5 model",
          "text": "Pretrain on tokenised limit order book messages using cross-entropy loss."
        },
        {
          "@type": "HowToStep",
          "position": 2,
          "name": "Setup fine-tuning task",
          "text": "Define order execution task to sell a fixed quantity within a time horizon, using Jax-LOB simulator."
        },
        {
          "@type": "HowToStep",
          "position": 3,
          "name": "Apply LoRA adapters",
          "text": "Apply LoRA with rank 4 on projection matrices, freezing SSM parameters and layer norms."
        },
        {
          "@type": "HowToStep",
          "position": 4,
          "name": "Define fitness",
          "text": "Use rank-based transformation of realised profit and loss (PnL) as fitness."
        },
        {
          "@type": "HowToStep",
          "position": 5,
          "name": "Train with EGGROLL",
          "text": "Fine-tune model with EGGROLL, observing improved mean PnL and reduced variance over training."
        }
      ]
    }
  ],
  "image": [
    {
      "@type": "ImageObject",
      "contentUrl": "page_1.png",
      "caption": "Figure 1: Schematic visualisation of EGGROLL using N workers (page 1)."
    },
    {
      "@type": "ImageObject",
      "contentUrl": "page_2.png",
      "caption": "Figure 2: (a) Relative speed of EGGROLL vs prior methods; (b) Pure integer pretraining test loss curves (page 2)."
    },
    {
      "@type": "ImageObject",
      "contentUrl": "page_4.png",
      "caption": "Mathematical equations and matrix Gaussian distribution definitions (page 4)."
    },
    {
      "@type": "ImageObject",
      "contentUrl": "page_11.png",
      "caption": "Figure 3: Marginal score multiplied by density for increasing rank r, showing fast convergence to Gaussian (page 11)."
    },
    {
      "@type": "ImageObject",
      "contentUrl": "page_12.png",
      "caption": "Figure 4: (a) RL returns normalized by PPO; (b) Validation score on Countdown task with RWKV 7g1.5B (page 12)."
    },
    {
      "@type": "ImageObject",
      "contentUrl": "page_13.png",
      "caption": "Figure 5: Validation score comparison on GSM8K and math reasoning tasks (page 13)."
    },
    {
      "@type": "ImageObject",
      "contentUrl": "page_14.png",
      "caption": "Figure 6: Perplexity and validation score during quantised distillation of RWKV 7g7B (page 14)."
    },
    {
      "@type": "ImageObject",
      "contentUrl": "page_15.png",
      "caption": "References and acknowledgements (page 15)."
    },
    {
      "@type": "ImageObject",
      "contentUrl": "page_27.png",
      "caption": "Lemma 1 proof and Gaussian concentration inequality (page 27)."
    },
    {
      "@type": "ImageObject",
      "contentUrl": "page_31.png",
      "caption": "Theorem 1 proof bounding ES update convergence rate (page 31)."
    },
    {
      "@type": "ImageObject",
      "contentUrl": "page_39.png",
      "caption": "Theorem 3 proof on EGGROLL convergence to linearity (page 39)."
    },
    {
      "@type": "ImageObject",
      "contentUrl": "page_44.png",
      "caption": "Edgeworth expansion and asymptotic rank analysis (page 44)."
    },
    {
      "@type": "ImageObject",
      "contentUrl": "page_52.png",
      "caption": "Figure 7: Relative speed of EGGROLL including noise regeneration (page 52)."
    },
    {
      "@type": "ImageObject",
      "contentUrl": "page_54.png",
      "caption": "Arithmetic intensity analysis for standard inference, Gaussian ES, and EGGROLL (page 54)."
    },
    {
      "@type": "ImageObject",
      "contentUrl": "page_58.png",
      "caption": "EGG integer training architecture and parameter update details (page 58)."
    },
    {
      "@type": "ImageObject",
      "contentUrl": "page_59.png",
      "caption": "Ablation study on data batch size and population size for integer pretraining (page 59)."
    },
    {
      "@type": "ImageObject",
      "contentUrl": "page_65.png",
      "caption": "Figure 11: Training curves for high-frequency trading fine-tuning with EGGROLL (page 65)."
    },
    {
      "@type": "ImageObject",
      "contentUrl": "page_66.png",
      "caption": "Figure 12: Multi-agent RL training curves and wall-clock times (page 66)."
    },
    {
      "@type": "ImageObject",
      "contentUrl": "page_70.png",
      "caption": "Figure 14: Reinforcement learning results across 16 environments (page 70)."
    },
    {
      "@type": "ImageObject",
      "contentUrl": "page_72.png",
      "caption": "Figure 15: Training time comparison between EGGROLL and OpenES (page 72)."
    }
  ],
  "publisher": {
    "@type": "Organization",
    "name": "University of Oxford, MILA, NVIDIA AI Technology Center, NormaCore.dev",
    "url": "https://eshyperscale.github.io/"
  },
  "creator": [
    {
      "@type": "Person",
      "name": "Bidipta Sarkar"
    },
    {
      "@type": "Person",
      "name": "Mattie Fellows"
    },
    {
      "@type": "Person",
      "name": "Juan Agustin Duque"
    },
    {
      "@type": "Person",
      "name": "Alistair Letcher"
    },
    {
      "@type": "Person",
      "name": "Antonio León Villares"
    },
    {
      "@type": "Person",
      "name": "Anya Sims"
    },
    {
      "@type": "Person",
      "name": "Clarisse Wibault"
    },
    {
      "@type": "Person",
      "name": "Dmitry Samsonov"
    },
    {
      "@type": "Person",
      "name": "Dylan Cope"
    },
    {
      "@type": "Person",
      "name": "Jarek Liesen"
    },
    {
      "@type": "Person",
      "name": "Kang Li"
    },
    {
      "@type": "Person",
      "name": "Lukas Seier"
    },
    {
      "@type": "Person",
      "name": "Theo Wolf"
    },
    {
      "@type": "Person",
      "name": "Uljad Berdica"
    },
    {
      "@type": "Person",
      "name": "Valentin Mohl"
    },
    {
      "@type": "Person",
      "name": "Alexander David Goldie"
    },
    {
      "@type": "Person",
      "name": "Aaron Courville"
    },
    {
      "@type": "Person",
      "name": "Karin Sevegnani"
    },
    {
      "@type": "Person",
      "name": "Shimon Whiteson"
    },
    {
      "@type": "Person",
      "name": "Jakob Nicolaus Foerster"
    }
  ],
  "articleBody": "Evolution Strategies (ES) are powerful black-box optimisation methods that scale poorly on GPUs due to low arithmetic intensity. We introduce EGGROLL, a low-rank ES algorithm that improves training speed by 100x for billion-parameter models, achieving up to 91% of pure batch inference throughput. We provide theoretical analysis proving convergence and linearisation in high dimensions. Experiments show EGGROLL enables stable pretraining of integer RNN language models, competitive LLM fine-tuning on reasoning tasks, and efficient tabula rasa RL training without performance loss."
}