\RequirePackage{rotating}
\documentclass{mcom-l}
\newtheorem{theorem}{Theorem}[section]
\newtheorem{lemma}[theorem]{Lemma}
\theoremstyle{definition}
\newtheorem{definition}[theorem]{Definition}
\newtheorem{example}[theorem]{Example}
\newtheorem{xca}[theorem]{Exercise}
\theoremstyle{remark}
\newtheorem{remark}[theorem]{Remark}
\newcommand{\abs}[1]{\lvert#1\rvert}
\newcommand{\HD}{H\"{o}rmann--Derflinger\xspace}
\numberwithin{equation}{section}
\usepackage{booktabs}
\usepackage{pdfsync}
\usepackage{verbatim}
\usepackage{graphicx}
\usepackage{xspace}
\usepackage{mathrsfs}
\usepackage{amsmath}
\usepackage{amssymb}
\usepackage{multirow}
\usepackage{rotating}
\usepackage{booktabs}
\usepackage{bm}
\usepackage{url}
\def\..{\,\mathpunct{\ldotp\ldotp}} % Middle stuff for intervals. Usage: \..
\setlength{\fboxrule}{0.2mm}
\setlength{\fboxsep}{2mm}
\newcommand{\frameit}[2]{
\begin{center}
{\color{red} \framebox[0.4\textwidth][l]{
\begin{minipage}{0.35\textwidth}
{\color{red}\bf #1}: {\color{black}#2}
\end{minipage}
}\\
}
\end{center}
}
\newcommand{\sing}[1]{\left\{\?#1\?\right\}}
\newcommand{\Z}{\mathbf Z}
\newcommand{\lst}[2]{${#1}_0$,~${#1}_1$, $\dots\,$,~${#1}_{#2-1}$}
\newcommand{\lstt}[2]{${#1}_0$,~${#1}_1$, $\dots\,$,~${#1}_{#2-2}$}
\newcommand{\xorshift}[1][]{\texttt{xorshift#1}\xspace}
\newcommand{\xoroshiro}[1][]{\texttt{xoroshiro#1}\xspace}
\newcommand{\xoroshirop}[1][]{\texttt{xoroshiro#1+}\xspace}
\newcommand{\xoroshiropp}[1][]{\texttt{xoroshiro#1++}\xspace}
\newcommand{\xoroshiros}[1][]{\texttt{xoroshiro#1*}\xspace}
\newcommand{\xoroshiross}[1][]{\texttt{xoroshiro#1**}\xspace}
\newcommand{\xorshifts}[1][]{\texttt{xorshift#1*}\xspace}
\newcommand{\xorshiftp}[1][]{\texttt{xorshift#1+}\xspace}
\newcommand{\xoshiro}[1][]{\texttt{xoshiro#1}\xspace}
\newcommand{\xoshirop}[1][]{\texttt{xoshiro#1+}\xspace}
\newcommand{\xoshiropp}[1][]{\texttt{xoshiro#1++}\xspace}
\newcommand{\xoshiros}[1][]{\texttt{xoshiro#1*}\xspace}
\newcommand{\xoshiross}[1][]{\texttt{xoshiro#1**}\xspace}
\newcommand{\wella}{\texttt{WELL1024a}\xspace}
\newcommand{\wellb}{\texttt{WELL19937a}\xspace}
\newcommand{\Zplus}{\dotplus}
\newcommand{\mt}[1][]{\texttt{MT19937}\xspace}
\newcommand{\xorgens}[1][]{\texttt{xorgens#1}\xspace}
\newcommand{\xsadd}{\texttt{XSadd}\xspace}
\newcommand{\xst}[3]{#1, #2, #3}
\newcommand{\xs}[4]{$A_{#1}(#2, #3, #4)$}
\newcommand{\la}{\langle}
\newcommand{\ra}{\rangle}
\newcommand{\Det}{\operatorname{Det}}
\newcommand{\Detr}{\operatorname{\textrm{Det}^{\textrm{r}}}}
\newcommand{\Detc}{\operatorname{\textrm{Det}^{\textrm{c}}}}
\newcommand{\?}{\mskip1.5mu}
\begin{document}
\bibliographystyle{amsplain}
\title[Computationally easy, spectrally good multipliers]{Computationally easy, spectrally good multipliers for congruential pseudorandom number generators}
\author{Guy Steele}
\address{Oracle Labs}
\email{guy.steele@oracle.com}
\author{Sebastiano Vigna}
\address{Universit\`a degli Studi di Milano, Italy}
\email{sebastiano.vigna@unimi.it}
\begin{abstract}
Congruential pseudorandom number generators rely on good \emph{multipliers}, that is, integers
that have good performance with respect to the spectral test. We provide lists
of multipliers with a good lattice structure up to dimension eight
for generators with typical power-of-two moduli, analyzing in detail
multipliers close to the square root of the modulus, whose product can be
computed quickly.
\end{abstract}
\maketitle
\section{Introduction}
A \emph{multiplicative congruential pseudorandom number generator} (MCG) is a computational process
defined by a recurrence of the form
\[
x_n = \bigl(a x_{n-1}\bigr)\bmod m,
\]
where $m\in\Z$ is the \emph{modulus}, $a\in\Z/m\Z$ is the \emph{multiplier}, and
$x_n\in(\Z/m\Z)\raise0.15ex\hbox{$\smallsetminus$}\{0\}$ is the \emph{state} of the generator after step $n$. Such
pseudorandom number generators (PRNGs) were introduced by
Lehmer~\cite{LehMMLSCU}, and have been extensively studied. By adding a constant $c\in\Z/m\Z$, $c\neq 0$, we obtain
a \emph{linear congruential pseudorandom number generator} (LCG), with state $x_n\in \Z/m\Z$:\footnote{We remark that these denominations, by now used for half a century, are completely wrong from a mathematical
viewpoint. The map $x\mapsto ax$ is indeed a \emph{linear} map, but the map $x\mapsto ax+c$ is an \emph{affine} map~\cite{BouEMA13}:
what we call an ``MCG'' or ``MLCG'' should called an ``LCG'' and what we call an ``LCG'' should be called an ``ACG''. The mistake originated probably
in the interest of Lehmer in (truly) linear maps with prime moduli~\cite{LehMMLSCU}. Constants were added later to obtain long-period
generators with non-prime moduli, but the ``linear'' name stuck (albeit some authors are using the term ``mixed'' instead of ``linear''). At this point
it is unlikely that the now-traditional names will be corrected.}
\[
x_n = \bigl(a x_{n-1} + c\bigr)\bmod m.
\]
Under suitable conditions on $m$, $a$ and $c$, sequences of this
kind are periodic and their period is \emph{full}, that is, $m-1$ for MCGs ($c=0$) and $m$
for LCGs ($c\neq 0$). For MCGs, $m$ must be prime and $a$ must be a \emph{primitive element}
of the multiplicative group of residue classes $(\Z/m\Z)^\times$ (i.e., its powers must span the whole group). For LCGs, there are simple conditions that
must be satisfied~\cite[\S 3.2.1.2, Theorem A]{KnuACPII}.
For MCGs, when $m$ is not prime one can look for sequences that have \emph{maximum period}, that is, the longest possible period, given $m$.
We will be interested in moduli that are powers of two, in which case, if $m\geq 8$,
the maximum period is $m/4$, and the state must be odd~\cite[\S 3.2.1.2, Theorem B]{KnuACPII}.
While MCGs and LCGs have some known defects, they can be used in combination
with other pseudorandom number generators (PRNGs), or passed through some bijective function that might lessen such
defects. Due to their speed and simplicity,
as well as a substantial accrued body of mathematical analysis,
they have been for a long time the PRNGs of choice in programming languages.
In this paper, we provide lists of multipliers for both MCGs and LCGs,
continuing the line of work by L'Ecuyer in his classic paper~\cite{LEcTLCGDSGLS}.
The quality of such multipliers is usually assessed by their score in the
\emph{spectral test}, described below.
The search for good multipliers is a sampling process from a large space: due to
the enormous increase in computational power in the last twenty years, we can now provide
multipliers with significantly improved scores. In fact, for multipliers of up to $35$ bits we have now explored
the sample space exhaustively.
We consider only generators with power-of-two moduli; this choice
avoids the expensive modulo operation, because nearly all contemporary
hardware supports binary arithmetic that is naturally carried modulo $2^w$ for some \emph{word size} $w$.
Such generators do have additional known, specific defects (e.g., the periods of the lowest bits are very short,
and the flip of a state bit will never propagate to lower bits), but there is a substantial body of literature
on how to ameliorate or avoid these defects.
Furthermore, in this paper we pay special attention to \emph{small} multipliers, that is, multipliers
close to the square root of the modulus $m$. For $m=2^{2w}$, this means
multipliers whose size in bits is $w\pm k$ for small $k$. As is well known, many CPUs with natural
word size $w$ can produce with a single instruction, or two instructions, the full $2w$-bit product of two $w$-bit
operands, which makes such multipliers attractive from a computational viewpoint.
Unfortunately, such small multipliers have known additional defects, which have been analyzed by
H\"{o}rmann and Derflinger~\cite{HoDPRNGWSRM}, who provided experimental
evidence of their undesirable behavior using a statistical test based on
rejection.
One of the goals of this paper is to deepen their analysis: we
first prove theoretically that $w$-bit multipliers for LCGs with power-of-two
modulus $2^{2w}$ have inherent theoretical
defects. Then we show that these defects are ameliorated as we add bits to
the multiplier, and we quantify this improvement by defining a new
\emph{figure of merit} based on the magnitude on the multiplier. In the end, we provide tables of multipliers of $w+k$
bits, where $k$ is relatively small, with quality closer to that of full $2w$-bit multipliers.
During the search of good multipliers, the authors have accumulated a large database
of candidates, which is publicly available for download, in case the reader is
interested in looking for multipliers with specific properties. The software used to search
for multipliers is available under the [choose license].
\section{Spectral figures of merit}
For every integer $d\geq 2$, the \emph{dimension}, we can consider the set of
$d$-dimensional points in the unit cube
\[
\Lambda_d = \Biggl\{\Biggl(\frac xm, \frac{f(x)}m, \frac{f^2(x)}m, \ldots, \frac{f^{d-1}(x)}m\Biggr) \Biggm| x\in \Z/m\Z\Biggr\},
\]
where \[f(x)=(ax+c) \bmod m\] is the next-state map of a full-period generator.
This set is the intersection of a $d$-dimensional \emph{lattice} with the unit cube~\cite[\S 3.3.4.A]{KnuACPII}. Thus,
all points in $\Lambda_d$ lie on a family of equidistant, parallel \emph{hyperplanes}; in fact, there
are many such families.
The \emph{spectral test} examines the family with the largest distance between
adjacent hyperplanes:
the smaller this \emph{largest interplane distance} is, the more evenly the generator fills the unit $d$-dimensional cube.
Using this idea, the \emph{figure of merit}
for dimension $d$ of an MCG or LCG is defined as
\[
f_d(m,a) =\frac{\nu_d}{\gamma_d^{1/2}\sqrt[d]{m}},
\]
where $1/\nu_d$ is the largest distance between adjacent hyperplanes found by considering all possible families of hyperplanes covering $\Lambda_d$.
We will usually imply the dependency on the choice of $m$ and $a$.
The definition of $f_d$ also relies on the \emph{Hermite constant} $\gamma_d$ for dimension $d$. For $2\leq d \leq 8$, the Hermite constant has these values:
\[
\gamma_2=(4/3)^{1/2}, \gamma_3=2^{1/3}, \gamma_4=2^{1/2}, \gamma_5=2^{3/5}, \gamma_6=(64/3)^{1/6}, \gamma_7=4^{3/7}, \gamma_8=2.
\]
For all higher dimensions except $d=24$ only upper and lower bounds are known. Note that $1/\bigl(\gamma_d^{1/2}\sqrt[d]{m}\bigr)$ is
the smallest possible such largest interplane distance \hbox{\cite[\S 3.3.4.E, equation (40)]{KnuACPII}}; it follows that $0 < f_d\leq 1$.
The reason for expressing the largest interplane distance in the form of a reciprocal $1/\nu_d$ is that $\nu_d$ is the \emph{length of the shortest
vector in the dual lattice $\Lambda^*_d$}. The dual lattice consists of all vectors whose scalar product with every vector
of the original lattice is an integer. In particular, it has the following basis~\cite[\S 3.3.4.C]{KnuACPII}:
\begin{align*}
(m, 0, 0, 0, & \ldots, 0, 0)\\
(-a, 1, 0, 0, &\ldots, 0, 0)\\
(-a^2, 0, 1, 0,& \ldots, 0, 0)\\
\vdots\;\, & \hphantom{\ldots,}\;\,\vdots \\
(-a^{d-2}, 0, 0, 0,& \ldots, 1, 0)\\
(-a^{d-1}, 0, 0, 0,& \ldots, 0, 1)
\end{align*}
That is, $\Lambda^*_d$ is formed by taking all possible linear combinations of the vectors above with integer coefficients.
Note that the constant $c$ of an LCG has no role in the structure of $\Lambda_d$ and $\Lambda^*_d$, and that
we are under a full-period assumption.
The dual lattice is somewhat easier to work with, as its points have all integer coordinates; moreover, as we
mentioned, if we call $\nu_d$ the length of its shortest vector, the maximum distance between parallel hyperplanes
covering $\Lambda_d$ is $1/\nu_d$ (and, indeed, this is how the figure of merit $f_d$ is computed).
\section{Computationally easy multipliers}
Multipliers smaller than $\sqrt m$ have been advocated, in particular when the modulus is a power of two, say $m=2^{2w}$, because they
do not require a full $2w$-bit multiplication: writing $x_\_$ and $x^\_$ for the $w$ lowest and highest bits,
respectively, of a $2w$-bit value $x$ (that is, $x_\_=x \bmod 2^w$ and $x^\_=\lfloor x/2^w\rfloor$), we have
\[
(ax)\bmod 2^{2w} = \bigl(ax_\_ + a\cdot 2^w x^\_ \bigl)\bmod 2^{2w} = \bigl(a x_\_ + 2^w\cdot a x^\_ \bigl) \bmod 2^{2w}.
\]
The first multiplication, $ax_\_$, has a $2w$-bit operand $a$ and a $w$-bit operand $x_\_$, and in general the result may be $2w$ bits wide;
but the second multiplication, $ax^\_$, can
be performed by an instruction that takes two $w$-bit operands and produces only a $w$-bit result that is only the low $w$ bits of the full product,
because the modulo operation effectively discards the high $w$ bits of that product.
Moreover, if the multiplier $a=2^w a^\_+ax^\_$ has a high part that is small (say, $a^\_ < 256$) or of a special form (for example, $a^\_ = j2^n$ where $j$ is $1$, $3$, $5$, or $9$), then the first multiplication may also be computed using a faster method.
Contemporary optimizing compilers know how to exploit such special cases, perhaps by using a small immediate operand rather than loading the entire multiplier into
a register, or perhaps by using shift instructions and/or such instructions as \texttt{lea} (Load Effective Address), which in the Intel 64-bit architecture may be used to compute $x+jy$ on two 64-bit operands $x$ and $y$ for $j$ = $2$, $4$, or $8$ \cite[p.\ 3-554]{INTEL-INSTRUCTION-SET}.
And even if the compiler produces the same code for, say, a multiplier that is $(3/2)w$ bits wide as for a multiplier that is $2w$ bits wide,
some hardware architectures may notice the smaller multiplier on the fly and handle it in a faster way.
Multiplication by a constant $a$ of size $w$, that is, of the form $a_\_$ (in other words, $a^\_=0$), is especially simple:
\[
a_\_x \bmod 2^{2w} = \bigl( a_\_ x_\_ + 2^w a_\_ x^\_ \bigr)\bmod 2^{2w} \hfil
\]
and notice that the addition can be performed as a $w$-bit addition of the low $w$ bits of $a_\_ x^\_$ into the high half of $a_\_ x_\_$.
In comparison, multiplication by a constant $a$ of size $w+1$, that is, of the form $2^w+a_\_$ (in other words, $a^\_=1$), requires only one extra addition:
\begin{multline*}
\bigl(\bigl(2^w + a_\_\bigr)x\bigr) \bmod 2^{2w} = \bigl(\bigl(2^w + a_\_\bigr)x_\_ + \bigl(2^w + a_\_\bigr)\bigl(x^\_ \cdot 2^w\bigr)\bigr) \bmod 2^{2w} =\\
\bigl( 2^wx_\_ + a_\_ x_\_ + 2^w\cdot a_\_ x^\_ \bigr)\bmod 2^{2w} =
\bigl( a_\_ x_\_ + 2^w\cdot (x_\_ + a_\_ x^\_ )\bigr)\bmod 2^{2w}.
\end{multline*}
Modern compilers know the reduction above and will reduce the strength of operations involved as necessary.
Even without the help of the compiler, we can push this idea further to multipliers of the form $2^{w+k}+ a$, where $k$ is a small positive integer constant:
\begin{multline*}
\bigl(\bigl(2^{w+k} + a\bigr)x\bigr) \bmod 2^{2w} = \bigl(\bigl(2^{w+k} + a\bigr)x_\_ + \bigl(2^{w+k} + a\bigr)\bigl(x^\_ \cdot 2^w\bigr)\bigr) \bmod 2^{2w} =\\
\bigl( 2^{w+k}x_\_ + a x_\_ + 2^w\cdot a x^\_ \bigr)\bmod 2^{2w} =
\bigl( a x_\_ + 2^w\cdot \bigl(2^k x_\_ + a x^\_ \bigr)\bigr)\bmod 2^{2w}.
\end{multline*}
In comparison to the $(w+1)$-bit case, we just need an additional shift to compute $2^k x_\_$.
In the interest of efficiency, it thus seems interesting to study in more detail
the quality of small multipliers.
In Figure~\ref{fig:intelcode}
we show code generated by the \texttt{clang} compiler
that uses 64-bit instructions to multiply a 128-bit value (in registers \texttt{rsi} and \texttt{rdi})
by (whimsically chosen) constants of various sizes.
The first example shows that if the constant is of size 64, indeed only two 64-bit by 64-bit multiply instructions
(one producing a 128-bit result and the other just a 64-bit result) and one 64-bit add instruction are needed.
The second example shows that if the constant is of size 65, indeed only one extra 64-bit add instruction is needed.
For constants of size 66 and above, more
sophisticated strategies emerge that use \texttt{leaq} (the quadword, that is, 64-bit form of \texttt{lea}) and shift instructions and even subtraction. In Figure~\ref{fig:armcode}
we show three examples of code generated by \texttt{clang} for the ARM processor: since its RISC architecture~\cite{ARM-INSTRUCTION-SET}
can only load constant values $16$ bits at a time, the length of the sequence of instructions grows
as the multiplier size grows. On the other hand, note that the ARM architecture has a multiply-add instruction \texttt{madd}.
\iffalse
\begin{figure}[p]
\centering
\begin{tabular}{@{}rrc@{}}
Bits&\multicolumn{1}{@{}c}{Multiplier} & Code \\[1ex]
\hline
64&\texttt{\footnotesize 0xCAFEF00DDEADF00D} &\begin{minipage}{5cm}\footnotesize%
\vspace*{0.4em}
\begin{verbatim}
movabsq $0xCAFEF00DDEADF00D, %rax
imulq %rax, %rsi
mulq %rdi
addq %rsi, %rdx
\end{verbatim}%
\vspace*{0.2em}
\end{minipage}\\
\hline
65&\texttt{\footnotesize 0x1CAFEF00DDEADF00D} &
\begin{minipage}{5cm}\footnotesize%
\vspace*{0.4em}
\begin{verbatim}
movabsq $0xCAFEF00DDEADF00D, %rdi
imulq %rdi, %rsi
addq %rax, %rsi
mulq %rdi
addq %rsi, %rdx
\end{verbatim}
\vspace*{0.2em}
\end{minipage}\\
\hline
66&\texttt{\footnotesize 0x2CAFEF00DDEADF00D}& \begin{minipage}{5cm}\footnotesize%
\vspace*{0.4em}
\begin{verbatim}
movabsq $xCAFEF00DDEADF00D, %rdi
imulq %rdi, %rsi
leaq (%rsi,%rax,2), %rcx
mulq %rdi
addq %rcx, %rdx
\end{verbatim}%
\vspace*{0.2em}
\end{minipage}\\
% \hline
% 66&\texttt{\footnotesize 0x3CAFEF00DDEADF00D} &
% \begin{minipage}{5cm}\footnotesize%
% \vspace*{1em}
% \begin{verbatim}
% movabsq $0xCAFEF00DDEADF00D, %rdi
% imulq %rdi, %rsi
% leaq (%rax,%rax,2), %rdx
% addq %rdx, %rsi
% mulq %rdi
% addq %rsi, %rdx
%
% \end{verbatim}%
% \vspace*{0.2em}
% \end{minipage}&
% $1.100$&$1.320$\\
\hline
67&\texttt{\footnotesize 0x4CAFEF00DDEADF00D} &
\begin{minipage}{5cm}\footnotesize%
\vspace*{0.4em}
\begin{verbatim}
movabsq $0xCAFEF00DDEADF00D, %rdi
imulq %rdi, %rsi
leaq (%rax,%rax,2), %rdx
addq %rdx, %rsi
mulq %rdi
addq %rsi, %rdx
\end{verbatim}%
\vspace*{0.2em}
\end{minipage}\\
\hline
67&\texttt{\footnotesize 0x5CAFEF00DDEADF00D} &
\begin{minipage}{5cm}\footnotesize%
\vspace*{0.4em}
\begin{verbatim}
movabsq $0xCAFEF00DDEADF00D, %rdi
imulq %rdi, %rsi
leaq (%rax,%rax,4), %rdx
addq %rdx, %rsi
mulq %rdi
addq %rsi, %rdx
\end{verbatim}%
\vspace*{0.2em}
\end{minipage}\\
% \hline
% 67&\texttt{\footnotesize 0x6CAFEF00DDEADF00D} &
% \begin{minipage}{5cm}\footnotesize%
% \vspace*{0.4em}
% \begin{verbatim}
% movabsq $0xCAFEF00DDEADF00D, %rdi
% imulq $6, %rax, %rdx
% imulq %rdi, %rsi
% addq %rdx, %rsi
% mulq %rdi
% addq %rsi, %rdx
%
% \end{verbatim}%
% \vspace*{0.2em}
% \end{minipage}&
% $1.100$&$1.320$\\
\hline
67&\texttt{\footnotesize 0x7CAFEF00DDEADF00D} &
\begin{minipage}{5cm}\footnotesize%
\vspace*{0.4em}
\begin{verbatim}
movabsq $0xCAFEF00DDEADF00D, %rdi
imulq $7, %rax, %rdx
imulq %rdi, %rsi
addq %rdx, %rsi
mulq %rdi
addq %rsi, %rdx
\end{verbatim}%
\vspace*{0.2em}
\vspace*{0.2em}
\end{minipage}\\
\hline
96&\texttt{\footnotesize 0xFADC0C0ACAFEF00DDEADF00D} &
\begin{minipage}{5cm}\footnotesize%
\vspace*{0.4em}
\begin{verbatim}
movl $0xFADC0C0A, %edx
movabsq $0xCAFEF00DDEADF00D, %rdi
imulq %rax, %rdx
imulq %rdi, %rsi
addq %rdx, %rsi
mulq %rdi
addq %rsi, %rdx
\end{verbatim}%
\vspace*{0.2em}
\end{minipage}\\
\hline
128&\texttt{\footnotesize 0xAB0DE0FBADC0FFEECAFEF00DDEADF00D} &
\begin{minipage}{5cm}\footnotesize%
\vspace*{0.4em}
\begin{verbatim}
movabsq $0xAB0DE0FBADC0FFEE, %rdx
movabsq $0xCAFEF00DDEADF00D, %rdi
imulq %rax, %rdx
imulq %rdi, %rsi
addq %rdx, %rsi
mulq %rdi
addq %rsi, %rdx
\end{verbatim}%
\vspace*{0.2em}
\end{minipage}
\end{tabular}
\caption{\label{fig:code}\texttt{gcc}-generated Intel code for the multiplication part of a $128$-bit LCG using
multipliers of increasing size. Note the different code generated in the $66$-bit case
depending on the two most significant bits. The code generated for more than $96$ bits (not shown here) is identical to the $128$-bit case.}
\end{figure}
\fi
\begin{figure}[p]
\centering
\begin{tabular}{@{}rrc@{}}
Bits&\multicolumn{1}{@{}c}{Multiplier} & Code \\[1ex]
\hline
64&\texttt{\footnotesize 0xCAFEF00DDEADF00D} &\begin{minipage}{5cm}\footnotesize%
\vspace*{0.4em}
\begin{verbatim}
movabsq $0xCAFEF00DDEADF00D, %rax
imulq %rax, %rsi
mulq %rdi
addq %rsi, %rdx
\end{verbatim}%
\vspace*{0.2em}
\end{minipage}\\
\hline
65&\texttt{\footnotesize 0x1CAFEF00DDEADF00D} &
\begin{minipage}{5cm}\footnotesize%
\vspace*{0.4em}
\begin{verbatim}
movabsq $0xCAFEF00DDEADF00D, %rcx
imulq %rcx, %rsi
mulq %rcx
addq %rdi, %rdx
addq %rsi, %rdx
\end{verbatim}
\vspace*{0.2em}
\end{minipage}\\
\hline
66&\texttt{\footnotesize 0x2CAFEF00DDEADF00D}& \begin{minipage}{5cm}\footnotesize%
\vspace*{0.4em}
\begin{verbatim}
movabsq $xCAFEF00DDEADF00D, %rcx
imulq %rcx, %rsi
mulq %rcx
leaq (%rdx,%rdi,2), %rdx
addq %rsi, %rdx
\end{verbatim}%
\vspace*{0.2em}
\end{minipage}\\
% \hline
% 66&\texttt{\footnotesize 0x3CAFEF00DDEADF00D} &
% \begin{minipage}{5cm}\footnotesize%
% \vspace*{1em}
% \begin{verbatim}
% movabsq $0xCAFEF00DDEADF00D, %rdi
% imulq %rdi, %rsi
% leaq (%rax,%rax,2), %rdx
% addq %rdx, %rsi
% mulq %rdi
% addq %rsi, %rdx
%
% \end{verbatim}%
% \vspace*{0.2em}
% \end{minipage}&
% $1.100$&$1.320$\\
\hline
67&\texttt{\footnotesize 0x4CAFEF00DDEADF00D} &
\begin{minipage}{5cm}\footnotesize%
\vspace*{0.4em}
\begin{verbatim}
movabsq $0xCAFEF00DDEADF00D, %rcx
imulq %rcx, %rsi
mulq %rcx
leaq (%rdx,%rdi,4), %rdx
addq %rsi, %rdx
\end{verbatim}%
\vspace*{0.2em}
\end{minipage}\\
\hline
67&\texttt{\footnotesize 0x5CAFEF00DDEADF00D} &
\begin{minipage}{5cm}\footnotesize%
\vspace*{0.4em}
\begin{verbatim}
movabsq $0xCAFEF00DDEADF00D, %rcx
mulq %rcx
imulq %rcx, %rsi
leaq (%rdi,%rdi,4), %rcx
addq %rcx, %rdx
addq %rsi, %rdx
\end{verbatim}%
\vspace*{0.2em}
\end{minipage}\\
% \hline
% 67&\texttt{\footnotesize 0x6CAFEF00DDEADF00D} &
% \begin{minipage}{5cm}\footnotesize%
% \vspace*{0.4em}
% \begin{verbatim}
% movabsq $0xCAFEF00DDEADF00D, %rdi
% imulq $6, %rax, %rdx
% imulq %rdi, %rsi
% addq %rdx, %rsi
% mulq %rdi
% addq %rsi, %rdx
%
% \end{verbatim}%
% \vspace*{0.2em}
% \end{minipage}&
% $1.100$&$1.320$\\
\hline
67&\texttt{\footnotesize 0x7CAFEF00DDEADF00D} &
\begin{minipage}{5cm}\footnotesize%
\vspace*{0.4em}
\begin{verbatim}
movabsq $0xCAFEF00DDEADF00D, %r8
mulq %r8
leaq (,%rdi,8), %rcx
subq %rdi, %rcx
addq %rcx, %rdx
imulq %r8, %rsi
addq %rsi, %rdx
\end{verbatim}%
\vspace*{0.2em}
\vspace*{0.2em}
\end{minipage}\\
\hline
96&\texttt{\footnotesize 0xFADC0C0ACAFEF00DDEADF00D} &
\begin{minipage}{5cm}\footnotesize%
\vspace*{0.4em}
\begin{verbatim}
movl $0xFADC0C0A, %ecx
movabsq $0xCAFEF00DDEADF00D, %r8
mulq %r8
imulq %rdi, %rcx
addq %rcx, %rdx
imulq %r8, %rsi
addq %rsi, %rdx
\end{verbatim}%
\vspace*{0.2em}
\end{minipage}\\
\hline
128&\texttt{\footnotesize 0xAB0DE0FBADC0FFEECAFEF00DDEADF00D} &
\begin{minipage}{5cm}\footnotesize%
\vspace*{0.4em}
\begin{verbatim}
movabsq $0xAB0DE0FBADC0FFEE, %rcx
movabsq $0xCAFEF00DDEADF00D, %r8
mulq %r8
imulq %rdi, %rcx
addq %rcx, %rdx
imulq %r8, %rsi
addq %rsi, %rdx
\end{verbatim}%
\vspace*{0.2em}
\end{minipage}
\end{tabular}
\caption{\label{fig:intelcode}\texttt{clang}-generated Intel code for the multiplication part of a $128$-bit LCG using
multipliers of increasing size. The code generated for more than $96$ bits (not shown here) is identical to the $128$-bit case.}
\end{figure}
\begin{figure}[p]
\centering
\begin{tabular}{@{}rrc@{}}
Bits&\multicolumn{1}{@{}c}{Multiplier} & Code \\[1ex]
\hline
64&\texttt{\footnotesize 0xCAFEF00DDEADF00D} &\begin{minipage}{5cm}\footnotesize%
\vspace*{0.4em}
\begin{verbatim}
mov x8, #0xF00D
movk x8, #0xDEAD, lsl #16
movk x8, #0xF00D, lsl #32
movk x8, #0xCAFE, lsl #48
umulh x9, x0, x8
madd x1, x1, x8, x9
mul x0, x0, x8
\end{verbatim}%
\vspace*{0.2em}
\end{minipage}\\
\hline
65&\texttt{\footnotesize 0x1CAFEF00DDEADF00D} &
\begin{minipage}{5cm}\footnotesize%
\vspace*{0.4em}
\begin{verbatim}
mov x8, #0xF00D
movk x8, #0xDEAD, lsl #16
movk x8, #0xF00D, lsl #32
movk x8, #0xCAFE, lsl #48
umulh x9, x0, x8
add x9, x9, x0
madd x1, x1, x8, x9
mul x0, x0, x8
\end{verbatim}
\vspace*{0.2em}
\end{minipage}\\
\hline
67&\texttt{\footnotesize 0x7CAFEF00DDEADF00D} &
\begin{minipage}{5cm}\footnotesize%
\vspace*{0.4em}
\begin{verbatim}
mov x8, #0xF00D
movk x8, #0xDEAD, lsl #16
movk x8, #0xF00D, lsl #32
movk x8, #0xCAFE, lsl #48
lsl x9, x0, #3
umulh x10, x0, x8
sub x9, x9, x0
add x9, x10, x9
madd x1, x1, x8, x9
mul x0, x0, x8
\end{verbatim}%
\vspace*{0.2em}
\vspace*{0.2em}
\end{minipage}\\
\hline
96&\texttt{\footnotesize 0xFADC0C0ACAFEF00DDEADF00D} &
\begin{minipage}{5cm}\footnotesize%
\vspace*{0.4em}
\begin{verbatim}
mov x8, #0xF00D
movk x8, #0xDEAD, lsl #16
movk x8, #0xF00D, lsl #32
movk x8, #0xCAFE, lsl #48
mov w9, #0x0C0A
movk w9, #0xFADC, lsl #16
umulh x10, x0, x8
madd x9, x0, x9, x10
madd x1, x1, x8, x9
mul x0, x0, x8
\end{verbatim}%
\vspace*{0.2em}
\end{minipage}\\
\hline
128&\texttt{\footnotesize 0xAB0DE0FBADC0FFEECAFEF00DDEADF00D} &
\begin{minipage}{5cm}\footnotesize%
\vspace*{0.4em}
\begin{verbatim}
mov x9, #0xF00D
mov x8, #0xFFEE
movk x9, #0xDEAD, lsl #16
movk x8, #0xADC0, lsl #16
movk x9, #0xF00D, lsl #32
movk x8, #0xE0FB, lsl #32
movk x9, #0xCAFE, lsl #48
movk x8, #0xAB0D, lsl #48
umulh x10, x0, x9
madd x8, x0, x8, x10
madd x1, x1, x9, x8
mul x0, x0, x9
\end{verbatim}%
\vspace*{0.2em}
\end{minipage}
\end{tabular}
\caption{\label{fig:armcode}\texttt{clang}-generated ARM code for the multiplication part of a $128$-bit LCG using
multipliers of increasing size. Note how the number of \texttt{mov} and \texttt{movk} instructions depends on the size of the multiplier.}
\end{figure}
\section{Bounds}
Our first result says that if the multiplier is smaller than the root of order $d$ of the modulus, there
is an upper bound to the value that the figure of merit $f_d$ can attain:
\begin{theorem}
\label{th:bound}
Consider a full-period generator with modulus $m$ and multiplier $a$.
Then, for every $d\geq2$, if $a<\sqrt[d]m$ we have $\nu_d=\sqrt{a^2+1}$,
and it follows that
\[
f_d = \frac{\sqrt{a^2+1}}{\gamma_d^{1/2}\sqrt[d]{m}}
\]
\end{theorem}
\begin{proof}
The length $\nu_d$ of the shortest vector of the dual lattice $\Lambda^*_d$ can be
easily written as
\begin{multline}
\label{eq:vec}
\nu_d =\min_{(x_0,\ldots,x_{d-1})\neq (0,\ldots,0)} \Bigl\{\sqrt{x_0^2+x_1^2+\cdots+x_{d-1}^2}\\ \Bigm| x_0+a x_1 + a^2 x_2 + \cdots + a^{t-1}x_{t-1} \equiv 0 \mod m\Bigr\},
\end{multline}
where $(x_0,\ldots,x_{d-1})\in \Z^d$, due to the simple structure of the basis of $\Lambda^*_d$~\cite[\S 3.3.4]{KnuACPII}.
Clearly, in general $\nu_d\leq\sqrt{a^2+1}$, as $(-a,1,0,0,\ldots,0)\in\Lambda^*_d$.
However, when $a<\sqrt[d]m$ we have $\nu_d = \sqrt{a^2+1}$, as no vector shorter than $\sqrt{a^2+1}$
can fulfill the modular condition.
To prove this statement, note that a vector $(x_0,\ldots,x_{d-1})\in \Lambda^*_d$ shorter than $\sqrt{a^2+1}$ must have all coordinates smaller than
$a$ in absolute value
(if one coordinate has absolute value $a$, all other coordinates must be zero, and the vector cannot belong to $\Lambda_d^*$).
Then, for every $0\leq j < d$
\[\left|\sum_{i=0}^{j} x_i a^i\right|\leq \sum_{i=0}^{j} \bigl|x_i\bigr| a^j < a^{j+1} < m,\]
so the modular condition in~(\ref{eq:vec}) must be fulfilled by equality with zero. However, let
$t$ be the index of the last nonzero component of $(x_0,\ldots,x_{d-1})$ (i.e., $x_i=0$ for $i>t$):
then, $\bigl|\sum_{i=0}^{t-1} x_i a^i\bigr|< a^t,$ whereas $|x_ta^t|\geq a^t$, so their sum cannot be zero.
\end{proof}
Note that if $m=a^d$, then the vector that is $a$ in position $d-1$ and zero elsewhere is in $\Lambda_d^*$, but by the proof
above shorter vectors cannot be, so
\[
f_d = \frac{a}{\gamma_d^{1/2}\sqrt[d]{m}}=\frac1{\gamma_d^{1/2}}.
\]
% Note: this is actually \leq, because the equation above covers the a = \sqrt[d] case.
Using the approximation $\sqrt{a^2+1}\approx a$, this means that if $a\leq\sqrt[d]m$ then for $2\leq d \leq 8$, $f_d$ cannot be greater than approximately
\begin{multline*}
(4/3)^{-1/4}\approx 0.9306,\quad 2^{-1/6}\approx 0.8909,\quad 2^{-1/4}\approx 0.8409,\quad 2^{-6/10} \approx 0.8122,\quad \\(64/3)^{-1/12}\approx 0.7749,\quad 4^{-3/14}\approx 0.7430,\quad 2^{-1/2}\approx 0.7071
\end{multline*}
for $d=2,\ldots, 8$.
For $d>2$ this is not a problem, as such very small multipliers are not commonly used. However, choosing a multiplier that is smaller than or
equal to $\sqrt m$ has the effect of making it impossible to obtain a figure of merit close to $1$ in dimension $2$.
% Moreover,
% as noted by~\cite{?}, the fact the the shortest vector is almost parallel to the $y$ axis has implications in the
% usability of the generator when computing deviates for a given distribution using the rejection method.
%
Note that, for any $d$, as $a$ drops well below $\sqrt[d]m$ the figure of merit $f_d$ degenerates quickly;
for example, if $a<\sqrt m / 2$ then $f_2$ cannot be greater than $(4/3)^{-1/4} / 2 \approx 0.4653$.
Nonetheless, as soon
as we allow $a$ to be even a tiny bit larger than $\sqrt{m}$, $\nu_2$ (and thus $f_2$) is no longer constrained: indeed, if $m=2^{2w}$, a $(w+1)$-bit multiplier
is sufficient to get a figure of merit in dimension $2$ very close to $1$ (see Table~\ref{tab:lcgcomp}).
MCGs with power-of-two moduli cannot achieve full period: the maximum period is $m/4$.
It turns out that the lattice structure, however, is very similar to the full-period case, once
we replace $m$ with $m/4$ in the definition of the dual lattice.
Correspondingly, we have to replace $\sqrt[d]m$ with $\sqrt[d]{m/4}$
(see~\cite[\S 3.3.4, Exercise 20]{KnuACPII}):
\begin{theorem}
\label{th:bound2}
Consider an MCG with power-of-two modulus $m$, multiplier $a$, and period $m/4$.
Then for every $d\geq2$ and every $a<\sqrt[d]{m/4}$ we have $\nu_d=\sqrt{a^2+1}$,
and it follows that
\[
f_d = \frac{\sqrt{a^2+1}}{\gamma_d^{1/2}\sqrt[d]{m/4}}.
\]
\end{theorem}
Note that Theorem~\ref{th:bound2} imposes limits on the figures of merit for
$(w-1)$-bit multipliers for $2w$-bit MCGs, but does not impose any limits
on $w$-bit multipliers for $2w$-bit MCGs.
In Table~\ref{tab:mcgcomp}, observe that the 31-bit multipliers necessarily have figures of merit $f_2$ smaller than
$(4/3)^{-1/4}\approx 0.9306$ (though one value for $f_2$, namely $0.930577$, is quite close),
but for multipliers of size $32$ and greater we have been able to choose examples
for which $f_2$ is well above $0.99$.
\section{Beyond spectral scores}
In view of Theorem~\ref{eq:vec},
it would seem that using a $(w+1)$-bit multiplier gives us the
full power of a $2w$-bit multiplier: or, at least, this is what the spectral scores suggest empirically. We
now show that, however, on closer inspection, the spectral scores are not telling the whole story.
H\"{o}rmann and Derflinger~\cite{HoDPRNGWSRM} studied multipliers close to the square root to the modulus for LCGs
with $32$ bits of state,
and devised a statistical test that makes generators using such multipliers fail: the intuition behind the test
is that with such multipliers there is a relatively short lattice vector $\bm s = (1/m, a/m)\in \Lambda_2$ that is almost parallel to the $y$ axis. The existence of this vector
creates bias in pairs of consecutive outputs, a bias that can be detected by generating a distribution
using the rejection method: if at some point the density of the distribution increases sharply,
the rejection method will underrepresent certain parts of the distribution and overrepresent others.
We applied an instance of the H\"{o}rmann--Derflinger test to congruential generators (both LCG and MCG) with
$64$ bits of state using a Cauchy distribution on the interval $[-2\..2)$.
We divide the interval into $10^8$ slots that contain the same probability mass,
repeatedly generate by rejection $10^9$ samples from the distribution, and compute a
$p$-value using a $\chi^2$ test on the slots. We consider the number of repetitions after
which the $p$-value is very close to zero\footnote{More precisely, when the $p$-value returned
by the Boost library implementation of the $\chi^2$ test becomes zero, which in this case
happens when the $p$-value goes below $\approx 10^{-16}$.} a measure of the
resilience of the multiplier to the \HD test, and thus a positive feature (that is, a larger number is better).
The results are reported in Tables~\ref{tab:lcgcomp} and~\ref{tab:mcgcomp}.
As we move from small to large multipliers, the number of iterations necessary to detect
bias grows, but within multipliers with the same number of bits there is a very large
variability.\footnote{We also tested a generator with $128$ bits of state and a $64$-bit multiplier, but at that size the bias
is undetectable even with a hundred times as many ($10^{10}$) slots.}
% \begin{figure}
% \centering
% \includegraphics{fig/mult-0}\hspace*{1cm}\includegraphics{fig/mult-1}\hspace*{1cm}\includegraphics{fig/mult-2}
% \caption{\label{fig:lattice}A lattice in which $\bm s = (1/m,a/m)$ is the second shortest vector; a lattice in which
% its components on a minimal basis are $(1,2)$, and one in which the components are $(2,3)$.}
% \end{figure}
The marked differences have a simple explanation: incrementing the
number of bits does not translate immediately into a significantly longer vector $\bm s$.
To isolate generators in which $\bm s$ is less pathological, we have to consider larger multipliers,
as $\|\bm s \|=\sqrt{a^2+1}/m$. In particular, we define the
simple figure of merit $\lambda$ for a full-period LCG as
\[
\lambda = \frac{\|\bm s\|}{1 / \sqrt m}=\frac{\sqrt{a^2+1}/m}{1 / \sqrt m} = \frac{\sqrt{a^2+1}}{\sqrt m}\approx a / \sqrt m
\]
In other words, we measure the length of $\bm s$ with respect to the threshold $1/\sqrt m$ of Theorem~\ref{th:bound}.
In general, for a set of multipliers bounded by $B$, $\lambda \leq B/\sqrt m$.
Note that because of Theorem~\ref{th:bound}, if $a < \sqrt m$
\[
f_2 / \lambda = \frac{\sqrt{a^2+1}}{\gamma_2^{1/2}\sqrt m}\big\slash \frac{\sqrt{a^2+1}}{\sqrt m}= \gamma_2^{-1/2}\approx 0.9306,
\]
that is, for multipliers smaller than $\sqrt m$ the two figures of merit $f_2$ and $\lambda$ are linearly correlated. Just one additional
bit, however, makes the two figures independent (see the entries for $33$-bit multipliers in Table~\ref{tab:lcgcomp},
as well as the entries for $32$-bit multipliers in Table~\ref{tab:mcgcomp}).
For MCGs with power-of-two modulus $m$, $\bm s = (4/m, 4a/m)$, and, in view of Theorem~\ref{th:bound2}, we define
\[
\lambda = \frac{\|\bm s\|}{1 / \sqrt{m/4}}=\frac{\sqrt{a^2+1}/(m/4)}{1 / \sqrt{m/4}} = \frac{\sqrt{a^2+1}}{\sqrt {m/4}}\approx 2 a / \sqrt m
\]
\begin{table}
\centering
\renewcommand{\arraystretch}{1.4}
\setlength{\tabcolsep}{8pt}
\begin{tabular}{lrrrr}
\multicolumn{1}{c}{Bits}&\multicolumn{1}{c}{$a$}&\multicolumn{1}{c}{$f_2$}&\multicolumn{1}{c}{$\lambda$}&\multicolumn{1}{c}{H--D}\\\hline
\input{lcg-comp.tex}
\end{tabular}
\caption{\label{tab:lcgcomp}A comparison of small LCG multipliers for $m=2^{64}$.
In the $32$-bit case, $f_2$ and $\lambda$ are linearly correlated,
and $f_2$ is necessarily smaller than approximately $0.9306$.
For sizes above 32 we show multipliers
with almost perfect $f_2$ but different $\lambda$.
The last column shows the corresponding number of iterations of the \HD test.}
\end{table}
\begin{table}
\centering
\renewcommand{\arraystretch}{1.4}
\setlength{\tabcolsep}{8pt}
\begin{tabular}{lrrrr}
\multicolumn{1}{c}{Bits}&\multicolumn{1}{c}{$a$}&\multicolumn{1}{c}{$f_2$}&\multicolumn{1}{c}{$\lambda$}&\multicolumn{1}{c}{H--D}\\\hline
\input{mcg-comp.tex}
\end{tabular}
\caption{\label{tab:mcgcomp}A comparison of small MCG multipliers for $m=2^{64}$.
In the $31$-bit case, $f_2$ and $\lambda$ are linearly correlated,
and $f_2$ is necessarily smaller than approximately $0.9306$.
For each size above 31 we show multipliers
with almost perfect $f_2$ but different $\lambda$.
The last column shows the corresponding number of iterations of the \HD test.}
\end{table}
In Tables~\ref{tab:lcgcomp} and~\ref{tab:mcgcomp} we report a few small-sized
multipliers together with the figures of merit
$f_2$ and $\lambda$, as well as the number of iterations required by our use of the \HD test: larger values
of $\lambda$ (i.e., larger multipliers) correspond to more resilience to the test.
\section{Potency}
\emph{Potency} is a property of multipliers of LCGs: it is defined as the
minimum $s$ such that $(a-1)^s$ is a multiple of the modulus $m$. Such an $s$
always exists for full-period multipliers, because one of the conditions for
full period is that $a-1$ be divisible by every prime that divides $m$ (when
$m$ is a power of two, this simply means that $a$ must be odd).
Multipliers of low potency generate sequences that do not look very random: in the
case $m$ is a power of two, this is very immediate, as a multiplier $a$ with low
potency is such that $a-1$ is divisible by a large power of two, say, $2^k$. In this
case, the $k$ lowest bits of $ax$ are the same as the $k$ lowest bits of $x$, which means that changes to
the $k$ lowest bits of the state depend only on the fact that we add $c$. For this
reason, one ordinarily chooses multipliers of maximum possible potency,\footnote{Note that ``maximum possible potency'' is a quite
rough statement, because potency is a very rough measure when applied to multipliers that are powers of primes: for example,
when $m=2^{2w}$ a generator with $a-1$ divisible by $2^w$ (but not by $2^{w+1}$) and a generator with $a-1$ divisible
by $2^{2w-1}$ have both potency $2$, but in view of the discussion above their randomness is very different. More precisely, here we choose
to consider only multipliers which leave unchanged that smallest possible number of lower bits.} and since for full period if $m$ is a multiple
of four, then $a-1$ must be a multiple of four, we have to choose $a$ so that $(a-1)/4$ is odd,
that is, $a\bmod 8 = 5$.
Potency has an interesting interaction with the constant $c$, described for the first time
by Durst~\cite{DurULCGPRNG} in response to proposals from Percus and Kalos~\cite{PeKRNGMPP} and Halton~\cite{HalPRT}
to use different constants to generate different streams for multiple processors. If we
take a multiplier $a$ and a constant $c$, then for every $r\in\Z/m\Z$ the generator with
multiplier $a$ and constant $(a-1)r +c$ has the same sequence of the first one, up to
addition with $r$. Indeed, if we consider sequences starting from $x_0$ and $y_0 = x_0 - r$, we have\footnote{All remaining computations in this section are performed in $\Z/m\Z$.}
\[
y_n = a y_{n-1} + (a-1)r + c = a (x_{n-1} - r) + (a-1)r + c = x_n - r.
\]
That is, for a fixed multiplier $a$, the constants $c$ are divided into classes by the equivalence relation of generating
the same sequence up to an additive constant.
How many classes do exist? The answer depends on the potency of $a$, as it comes down to solving
the modular equation
\[
c' - c =(a-1)r
\]
If $a$ has low potency, this equation will be rarely solvable because there will be many equivalence classes: but for the specific case
where $m$ is a power of two and $a\bmod 8 = 5$, it turns out that there are just \emph{two} classes: the
class of constants that are congruent to $1$ modulo $4$, and the class of constants that
are congruent to $3$ modulo $4$. All constants in the first class yield the sequence
$x_n = ax_{n-1}+1$, up to an additive constant, and all constants in the second
class yield the sequence $x_n = ax_{n-1}-1$, up to an additive constant.
It follows that if one tries to use three (or more) different streams, even if one chooses different
constants for the streams, at least two of the streams will be correlated.
If we are willing to weaken slightly our notion of equivalence, in this case
we can extend Durst's considerations: if we consider sequences starting from $x_0$ and $y_0 = - x_0 + r$, then
\[
y_n = a y_{n-1} - ((a-1)r + c) = a (-x_{n-1} + r) - (a-1)r - c = -x_n + r.
\]
Thus, if we consider the equivalence relation of generating sequences that are the same up to an additive constant \emph{and} possibly a sign change,
then \emph{all} sequences generated by a multiplier $a$ of
maximum potency for a power-of-two modulus $m$ are the same, because to prove equivalence we now need to solve just
\emph{one} of the two modular equations
\[
c' - c =(a-1)r \qquad \hbox{and} \qquad c' + c = (a-1)r,
\]
and while the first equation is solvable when the residues of $c$ and $c'$ modulo $4$ are the
same, the second equation is solvable when the residues are different.
\section{Using spectral data from MCGs}
\label{sec:MCGs}
The case of MCGs with power-of-two modulus is different from that of LCGs
because the maximum possible period is of length $m/4$~\cite[\S 3.2.1.2, Theorem C]{KnuACPII}. Thus, there are two
distinct orbits (remember that the state must be odd).
The nature of these orbits is, however, very different depending on whether the
multiplier is congruent to $5$ modulo $8$ or to $3$ modulo $8$: let us say such multipliers are
of \emph{type $5$} and \emph{type $3$}, respectively.
For multipliers of type $5$, each orbit is defined by the residue modulo $4$ of
the state (i.e., $1$ or $3$), whose value depends on the second-lowest
bit.\footnote{This is a consequence of the fact that multipliers of type $5$
do not change the two lowest bits.} Thus, the
remaining upper bits (above the second) go through all possible $m/4$ values. More importantly, the
lattice of points described by the upper bits is simply a translated version of
the lattice $\Lambda_d$ associated with the whole state, so the figures or merit we compute on
$\Lambda_d^*$ describe properties of the generator obtained by discarding the two lowest bits
from the state. Indeed, for every MCG of type $5$
there is an LCG with modulus $m/4$ that generates ``the same sequence'' if the
two low-order bits of every value produced by the MCG are ignored~\cite[\S 3.2.1.2, Exercise 9]{KnuACPII}.
For multipliers of type $3$, instead, each orbit is defined by the residue
modulo $8$ of the state: one orbit alternates between residues $1$ and $3$, and one
orbit alternates between $5$ and $7$.\footnote{Multipliers of type $3$
always leave the lowest bit and the third-lowest bit of the state unchanged.} In this case, there is
no way to use the information we have about the lattice generated by
the whole state to obtain information about the lattice generated by the
part of state that is changing; indeed, there is again a correspondence with
an LCG, but the correspondence involves an alternating sign (again, see~\cite[\S 3.2.1.2, Exercise 9]{KnuACPII}).
For this reason, we (like L'Ecuyer~\cite{LEcTLCGDSGLS}) will consider only MCG multipliers of type $5$.
Note that $a$ and $-a \bmod m = m-a$ have different residue modulo $8$, but the same figures of merit~\cite[\S 3.2.1.2, Exercise 9]{KnuACPII}.
Moreover, in the MCG case
the lattice structure is invariant with respect to inversion modulo $m$, so for each
multiplier its inverse modulo $m$ has again the same figures of merit. In the end, for
each multiplier $a$ of maximum period $m/4$ there are three other related multipliers $a^{-1} \bmod m$,
$(-a) \bmod m$ and $ (- a^{-1}) \bmod m$ with the same figures of merit; of the four, two are of type $3$, and two of type $5$.
\section{Tables}
In this section we provide tables of good multipliers for $32$, $64$ and $128$ bits of state,
updating some of the lists in the classic paper by L'Ecuyer~\cite[Tables~4 and~5]{LEcTLCGDSGLS}.
For LCGs, only multipliers $a$ such that $a\bmod 8$ is either $1$ or $5$ achieve
full period~\cite[\S 3.2.1.2, Theorem A]{KnuACPII}, but we (like L'Ecuyer)
consider only the case of maximum potency, that is, the case when $a\bmod 8$ is $5$.
For MCGs, as we already discussed in Section~\ref{sec:MCGs}, we consider only multipliers of type $5$.
In the end, therefore, we consider in both cases (though for different reasons) only multipliers whose residue modulo $8$ is $5$.
For each multiplier, we considered figures of merit up to dimension $8$, that is,
we computed $f_2$, $f_3$, $f_4$, $f_5$, $f_6$, $f_7$, and $f_8$.
For reasons of space, we present only $f_2$ through $f_6$ in the tables.
We also present two different scores that summarize these figures of merit:
the customary \emph{minimum} spectral score (over all seven dimensions $2$ through $8$)
and a novel \emph{harmonic} spectral score (also over all seven dimensions $2$ through $8$).
The tables present not only the multipliers with the best minimum spectral scores that we found
and the multipliers with the best harmonic spectral scores that we found,
but also multipliers that exhibit a good balance between the two scores,
as described below.
Traditionally, when examining the figures of merit of the spectral test up to
dimension $d$, the \emph{minimum spectral score (up to dimension $d$)} is given by the \emph{minimum} figure of merit
over dimensions $2$ through $d$. L'Ecuyer's paper~\cite{LEcTLCGDSGLS} uses the notation
$M_d(m,a)$ for this aggregate score for a generator with modulus $m$ and multiplier $a$.
We prefer to distinguish the minimum spectral scores of LCGs and MCGs, because the figures
of merit $f_d$ are computed differently for the two kinds of generator when the modulus is a power of two:
we use the notation
\[
\mathscr{M}^+_d(m,a) = \min_{2\leq i\leq d} f_i(m,a)
\]
to denote the minimum spectral score up to dimension $d$ for an LCG, and we use the notation
$\mathscr{M}^*_d(m,a)$ to denote the analogous score for an MCG.
The use of the minimum spectral score seems to have originated in the work of
Fishman and Moore~\cite{FiMEAMCRNG}, where, however, no motivation for this
choice is provided. The definition has been referred to and copied several times,
but
even Knuth argues that the importance of figures of merit decreases
with dimension, and that ``the values of $\nu_t$ for $t\geq 10$ seem to be of no
practical significance whatsoever''~\cite[\S 3.3.4]{KnuACPII}.
Therefore, while L'Ecuyer's paper reports three different minimum figures of merit $M_8(m,a)$, $M_{16}(m,a)$, and $M_{32}(m,a)$ for each multiplier,
here we will report only $\mathscr{M}^+_8(m,a)$ or $\mathscr{M}^*_8(m,a)$.
The disadvantage of the minimum spectral score is that it tends to flatten the
spectral landscape---it is easy, even using small multipliers, to get figures
of merit up to dimension 8 greater than $0.77$. But smaller dimensions should
be given more importance, as a lower figure of merit in a lower dimension is more
likely to have an impact on applications, and a multiplier with a very high minimum spectral score
over $2 \leq d \leq 8$ may have an unremarkable value for, say, $f_2$.
We therefore suggest considering also a second aggregate figure of merit:
\begin{definition}
Let $f_i(m,a)$, $2\leq i\leq d$, be the figures of merit of an LCG multiplier $a$ with modulus $m$. Then, the
\emph{harmonic spectral score (up to dimension $d$)} of $a$ with modulus $M$ is given by
\[
\mathscr H^+_d(m,a) = \frac{1}{H_{d-1}}\sum_{2\leq i\leq d} \frac{f_i(m,a)}{i-1},
\]
where $H_n=\sum_{k=1}^n\frac{1}{k}$ is the $n$-th harmonic number.\footnote{We have used script letters $\mathscr{M}$ and $\mathscr{H}$ to denote spectral scores so
that the harmonic spectral score function $\mathscr{H}_8$ will not be confused with the harmonic number $H_8$.}
Analogously, the notation $\mathscr H^*_d(m,a)$
denotes the harmonic spectral score (up to dimension $d$) for an MCG multiplier $a$ with modulus $m$.
\end{definition}
The effect of the harmonic spectral score is to weight each dimension progressively less,
using weights $1, 1/2, 1/3, \ldots, 1/(d-1)$, and the sum is normalized so that the score is always between $0$ and $1$.
An example of the difference in sensitivity between the minimum spectral score and the harmonic spectral score
is that the minimum spectral score is in practice not limited by Theorem~\ref{th:bound}; for example,
the largest minimum spectral score of a $32$-bit multiplier for a $64$-bit LCG is $0.774103$,
and the largest minimum spectral score
for a $33$-bit multiplier is almost the same: $0.776120$ (an increase of about $0.002$). But the largest spectral harmonic score
goes from $0.867371$ for $32$-bit multipliers to $0.890221$ for $33$-bit multipliers (an increase of almost $0.03$),
reflecting the fact that $f_2$ can get arbitrarily close to $1$ (indeed, there are $33$-bit multipliers for which $f_2=0.998598$).
Another empirical observation in favor of the harmonic spectral score is that as soon
as we look into multipliers with a high harmonic score, we see that their minimum score can
be chosen to be just a few percentage points below the best possible, but at the same time the low-dimensional
figures of merit, which are more relevant, have an increase an order of magnitude larger. These empirical
observations are based on multipliers of at most $35$ bits, which we have enumerated and scored exhaustively,
but the same phenomenon appears to happen in larger cases, which we have sampled randomly.
Following a suggestion by Entacher, Schell, and Uhl~\cite{ESUELALGPS}, we compute
figures of merit using the implementation of the ubiquitous Lenstra--Lenstra--Lov{\'a}sz
basis-reduction algorithm~\cite{LLL} provided by Shoup's NTL library~\cite{ShoNTL}.
For $m=2^{64}$ and $m=^{128}$ we recorded in an output file all tested multipliers whose
minimum spectral score is at least $0.70$ (we used a lower threshold for $m=2^{32}$).
Overall we sampled approximately $6.5\times10^{11}$ multipliers, enough to ensure that
for each pair of modulus and multiplier size reported,
we recorded at least one million multipliers.
(In several cases we recorded as many as 1.5 million or even two million multipliers.)
As a sanity check, we also used the same software to test multipliers of size 63 for LCGs with $m=2^{128}$;
as expected, in view of Theorem~\ref{th:bound} and its consequences,
a random sample of well over $10^{10}$ 63-bit candidates revealed \emph{none}
whose minimum spectral score is at least $0.70$.
In theory, the basis returned by the
algorithm is only approximate, but using a precision parameter
$\delta=1-10^{-9}$ we found only very rarely a basis that was not made of shortest vectors:
we checked all multipliers we selected using the LatticeTester
tool,\footnote{\url{https://github.com/umontreal-simul/latticetester}} which
performs an exhaustive search after basis-reduction preprocessing, and almost all
approximated data we computed turned out to be exact; just a few cases (usually in high dimension)
were slightly off, which simply means that we spuriously stored a few candidates with minimum below $0.70$.
Besides half-width and full-width multipliers, we provide multipliers with up to three
bits more than half-width for $m=2^{32}$ and $m=2^{64}$, and up to seven bits more than half-width for $m=2^{128}$,
as well as multipliers of three-fourths width (24 bits for $m=2^{32}$, 48 bits for $m=2^{64}$, 96 bits for $m=2^{128}$),
because these are experimentally often as fast as smaller multipliers. Additionally, we provide $80$-bit
multipliers for $m=2^{128}$ because
such multipliers can be loaded by the ARM processor with just five instructions, and on an Intel processor one can use
a multiply instruction with an immediate $16$-bit value.
For small multipliers, we try to find candidates with a good $\lambda$: in particular,
we require that the second-most-significant bit be set. For
larger multipliers, we consider only spectral scores, as the effect of a good $\lambda$
becomes undetectable. Since when we consider $(w+c)$-bit multipliers we select candidates
larger than $2^{w+c-1}$, in our tables $2^{c-1}\leq\lambda\leq 2^c$ for LCGs and $2^{c}\leq\lambda\leq 2^{c+1}$ for MCGs.
More precisely, for each type (LCG or MCG), every $m \in \left\{2^{32}, 2^{64}, 2^{128}\right\}$ and
for every multiplier size (in bits) tested, we report (in Tables~\ref{tab:lcg32} through~\ref{tab:mcg128-1}) four multipliers:
\begin{itemize}
\item the best multiplier by harmonic spectral score;
\item the best multiplier by harmonic spectral score within the top millile
of minimum spectral scores.
\item the best multiplier by minimum spectral score;
\item the best multiplier by minimum spectral score within the top millile
of harmonic spectral scores.
\end{itemize}
In case the first-millile criterion provides a duplicate multiplier for a given score, we try the same strategy
with the first \emph{decimillile}, and mark the multiplier with an asterisk, or with the first
\emph{centimillile}, marking with two asterisks, and so on.
The rationale for these reporting criteria is that the best score gives an idea of how far we went in our sampling
procedure, but in principle the best score within the first millile of the alternative
score gives a more balanced multiplier: indeed, within every list of four, the \emph{second} multiplier
(best multiplier by harmonic spectral score within the top millile of minimum spectral scores) is our favorite candidate.
All multipliers we provide are \emph{Pareto optimal} for our dataset: that is,
for each type, modulus, and size there is no other multiplier we examined that is
at least as good on both scores, and strictly improves one.
In particular, for each type, modulus, and size, the multipliers with
distinct scores are pairwise incomparable (i.e., for each pair, the harmonic
spectral score increases and the minimum spectral score decreases, or
vice versa).
\newpage
\newbox\tempa
\newbox\tempb
% Arguments are: heading, text having width of entries, text having width of column
\newcommand\headertowidth[3]{{\setbox\tempa\hbox{\tt #2}\setbox\tempb\hbox{\tt #3}\hbox to \wd\tempb{\hfil\hbox to \wd\tempa{\hfil#1\hfil}}}}
\begin{sidewaystable}[p]
\centering
\renewcommand{\arraystretch}{0.9}
\setlength{\tabcolsep}{6pt}
\begin{tabular}{@{}rrrrrrrrrl@{}}
\multicolumn{1}{@{}c}{Bits}&\multicolumn{1}{c}{$\mathscr H^+_8(m,a)$}&\multicolumn{1}{c}{$\mathscr M^+_8(m,a)$}&\multicolumn{1}{c}{\headertowidth{$a$}{0x00000000}{0x00000000000000000000000000000000}}&\multicolumn{1}{c}{$f_2$}&\multicolumn{1}{c}{$f_3$}&\multicolumn{1}{c}{$f_4$}&\multicolumn{1}{c}{$f_5$}&\multicolumn{1}{c}{$f_6$}&\multicolumn{1}{c@{}}{\headertowidth{\llap{$\lambda$ }}{$0.0\times10^{00}$}{$0.0\times10^{00}$}}\\\midrule
\input{lcg32.tex}
\end{tabular}
\caption{\label{tab:lcg32}Good multipliers for LCGs with $m=2^{32}$.}
\end{sidewaystable}
\begin{sidewaystable}[p]
\centering
\renewcommand{\arraystretch}{0.9}
\setlength{\tabcolsep}{6pt}
\begin{tabular}{@{}rrrrrrrrrl@{}}
\multicolumn{1}{@{}c}{Bits}&\multicolumn{1}{c}{$\mathscr H^*_8(m,a)$}&\multicolumn{1}{c}{$\mathscr M^*_8(m,a)$}&\multicolumn{1}{c}{\headertowidth{$a$}{0x00000000}{0x00000000000000000000000000000000}}&\multicolumn{1}{c}{$f_2$}&\multicolumn{1}{c}{$f_3$}&\multicolumn{1}{c}{$f_4$}&\multicolumn{1}{c}{$f_5$}&\multicolumn{1}{c}{$f_6$}&\multicolumn{1}{c@{}}{\headertowidth{\llap{$\lambda$ }}{$0.0\times10^{00}$}{$0.0\times10^{00}$}}\\\midrule
\input{mcg32.tex}
\end{tabular}
\caption{\label{tab:mcg32}Good multipliers for MCGs with $m=2^{32}$.}
\end{sidewaystable}
\begin{sidewaystable}[p]
\centering
\renewcommand{\arraystretch}{0.9}
\setlength{\tabcolsep}{6pt}
\begin{tabular}{@{}rrrrrrrrrl@{}}
\multicolumn{1}{@{}c}{Bits}&\multicolumn{1}{c}{$\mathscr H^+_8(m,a)$}&\multicolumn{1}{c}{$\mathscr M^+_8(m,a)$}&\multicolumn{1}{c}{\headertowidth{$a$}{0x0000000000000000}{0x00000000000000000000000000000000}}&\multicolumn{1}{c}{$f_2$}&\multicolumn{1}{c}{$f_3$}&\multicolumn{1}{c}{$f_4$}&\multicolumn{1}{c}{$f_5$}&\multicolumn{1}{c}{$f_6$}&\multicolumn{1}{c@{}}{\headertowidth{\llap{$\lambda$ }}{$0.0\times10^{00}$}{$0.0\times10^{00}$}}\\\midrule
\input{lcg64.tex}
\end{tabular}
\caption{\label{tab:lcg64}Good multipliers for LCGs with $m=2^{64}$.}
\end{sidewaystable}
\begin{sidewaystable}[p]
\centering
\renewcommand{\arraystretch}{0.9}
\setlength{\tabcolsep}{6pt}
\begin{tabular}{@{}rrrrrrrrrl@{}}
\multicolumn{1}{@{}c}{Bits}&\multicolumn{1}{c}{$\mathscr H^*_8(m,a)$}&\multicolumn{1}{c}{$\mathscr M^*_8(m,a)$}&\multicolumn{1}{c}{\headertowidth{$a$}{0x0000000000000000}{0x00000000000000000000000000000000}}&\multicolumn{1}{c}{$f_2$}&\multicolumn{1}{c}{$f_3$}&\multicolumn{1}{c}{$f_4$}&\multicolumn{1}{c}{$f_5$}&\multicolumn{1}{c}{$f_6$}&\multicolumn{1}{c@{}}{\headertowidth{\llap{$\lambda$ }}{$0.0\times10^{00}$}{$0.0\times10^{00}$}}\\\midrule
\input{mcg64.tex}
\end{tabular}
\caption{\label{tab:mcg64}Good multipliers for MCGs with $m=2^{64}$.}
\end{sidewaystable}
\begin{sidewaystable}[p]
\centering
\renewcommand{\arraystretch}{0.9}
\setlength{\tabcolsep}{6pt}
\begin{tabular}{@{}rrrrrrrrrl@{}}
\multicolumn{1}{@{}c}{Bits}&\multicolumn{1}{c}{$\mathscr H^+_8(m,a)$}&\multicolumn{1}{c}{$\mathscr M^+_8(m,a)$}&\multicolumn{1}{c}{\headertowidth{$a$}{0x00000000000000000000000000000000}{0x00000000000000000000000000000000}}&\multicolumn{1}{c}{$f_2$}&\multicolumn{1}{c}{$f_3$}&\multicolumn{1}{c}{$f_4$}&\multicolumn{1}{c}{$f_5$}&\multicolumn{1}{c}{$f_6$}&\multicolumn{1}{c@{}}{\headertowidth{\llap{$\lambda$ }}{$0.0\times10^{00}$}{$0.0\times10^{00}$}}\\\midrule
\input{lcg128-0.tex}
\end{tabular}
\caption{\label{tab:lcg128-0}Good multipliers for LCGs with $m=2^{128}$ (smaller sizes).}
\end{sidewaystable}
\begin{sidewaystable}[p]
\centering
\renewcommand{\arraystretch}{0.9}
\setlength{\tabcolsep}{6pt}
\begin{tabular}{@{}rrrrrrrrrl@{}}
\multicolumn{1}{@{}c}{Bits}&\multicolumn{1}{c}{$\mathscr H^+_8(m,a)$}&\multicolumn{1}{c}{$\mathscr M^+_8(m,a)$}&\multicolumn{1}{c}{\headertowidth{$a$}{0x00000000000000000000000000000000}{0x00000000000000000000000000000000}}&\multicolumn{1}{c}{$f_2$}&\multicolumn{1}{c}{$f_3$}&\multicolumn{1}{c}{$f_4$}&\multicolumn{1}{c}{$f_5$}&\multicolumn{1}{c}{$f_6$}&\multicolumn{1}{c@{}}{\headertowidth{\llap{$\lambda$ }}{$0.0\times10^{00}$}{$0.0\times10^{00}$}}\\\midrule
\input{lcg128-1.tex}
\end{tabular}
\caption{\label{tab:lcg128-1}Good multipliers for LCGs with $m=2^{128}$ (larger sizes).}
\end{sidewaystable}
\begin{sidewaystable}[p]
\centering
\renewcommand{\arraystretch}{0.9}
\setlength{\tabcolsep}{6pt}
\begin{tabular}{@{}rrrrrrrrrl@{}}
\multicolumn{1}{@{}c}{Bits}&\multicolumn{1}{c}{$\mathscr H^*_8(m,a)$}&\multicolumn{1}{c}{$\mathscr M^*_8(m,a)$}&\multicolumn{1}{c}{\headertowidth{$a$}{0x00000000000000000000000000000000}{0x00000000000000000000000000000000}}&\multicolumn{1}{c}{$f_2$}&\multicolumn{1}{c}{$f_3$}&\multicolumn{1}{c}{$f_4$}&\multicolumn{1}{c}{$f_5$}&\multicolumn{1}{c}{$f_6$}&\multicolumn{1}{c@{}}{\headertowidth{\llap{$\lambda$ }}{$0.0\times10^{00}$}{$0.0\times10^{00}$}}\\\midrule
\input{mcg128-0.tex}
\end{tabular}
\caption{\label{tab:mcg128-0}Good multipliers for MCGs with $m=2^{128}$ (smaller sizes).}
\end{sidewaystable}
\begin{sidewaystable}[p]
\centering
\renewcommand{\arraystretch}{0.9}
\setlength{\tabcolsep}{6pt}
\begin{tabular}{@{}rrrrrrrrrl@{}}
\multicolumn{1}{@{}c}{Bits}&\multicolumn{1}{c}{$\mathscr H^*_8(m,a)$}&\multicolumn{1}{c}{$\mathscr M^*_8(m,a)$}&\multicolumn{1}{c}{\headertowidth{$a$}{0x00000000000000000000000000000000}{0x00000000000000000000000000000000}}&\multicolumn{1}{c}{$f_2$}&\multicolumn{1}{c}{$f_3$}&\multicolumn{1}{c}{$f_4$}&\multicolumn{1}{c}{$f_5$}&\multicolumn{1}{c}{$f_6$}&\multicolumn{1}{c@{}}{\headertowidth{\llap{$\lambda$ }}{$0.0\times10^{00}$}{$0.0\times10^{00}$}}\\\midrule
\input{mcg128-1.tex}
\end{tabular}
\caption{\label{tab:mcg128-1}Good multipliers for MCGs with $m=2^{128}$ (larger sizes).}
\end{sidewaystable}
\clearpage
\bibliography{biblio,mult}
\end{document}