; ----------------------------- MERSENNA.ASM ---------------------------
;  Random Number generator 'Mersenne Twister' type MT11213A (or MT19937)
;  AgF 2004-03-31
;
;  This random number generator is described in the article by
;  M. Matsumoto & T. Nishimura, in:
;  ACM Transactions on Modeling and Computer Simulation,
;  vol. 8, no. 1, 1998, pp. 3-30. See also:
;  http://www.math.sci.hiroshima-u.ac.jp/~m-mat/MT/emt.html
;
;  Initialization:
;  TRandomInit must be called before the first call to any of the other
;  random number functions. The seed is any 32-bit integer.
;  You may use TRandomInitByArray instead of TRandomInit if you want more
;  than 32 bits for seed. length is the number of integers in seeds[].
;  length must be > 0, there is no upper limit for length.
;
;  Generating random numbers:
;  TRandom returns a floating point number in the interval 0 <= x < 1 with
;  a resolution of 32 bits.
;  TRandom2 returns a floating point number in the same interval with 
;  a resolution of 52 bits.
;  TRandom3 returns a floating point number in the same interval with 
;  a resolution of 63 bits. (The generated number may become = 1 when rounded
;  to a lower precision).
;  TIRandom returns an integer in the interval defined by min and max with
;  a resolution of 32 bits.
;  TBRandom returns 32 random bits.
;
;  Error conditions:
;  If TRandomInit or TRandomInitByArray has not been called then TRandom
;  and TBRandom keep returning 0, and TIRandom returns min.
;  TIRandom returns a large negative number if max < min.
;
;  C++ prototypes in randoma.h:
;  extern "C" void TRandomInit (int seed);
;  extern "C" void TRandomInitByArray (int seeds[], int length);
;  extern "C" double TRandom (void);
;  extern "C" double TRandom2 (void);
;  extern "C" long double TRandom3 (void);
;  extern "C" int TIRandom (int min, int max);
;  extern "C" unsigned TBRandom ();
;
;  (c) 2001, 2004 Agner Fog.
;  GNU General Public License www.gnu.org/copyleft/gpl.html
; ----------------------------------------------------------------------

; Assembler-specific setup. The symbol ??version is tested to tell the two
; supported assemblers apart; both branches select the flat memory model.
ifdef	??version      ; for Borland TASM v. 3.0 or later
  .386
  include p4macros.asi ; NOTE(review): presumably supplies the XMM/SSE2 instructions used below - confirm
  .model flat
else                   ; for Microsoft ML version 6.15 or later
  .686
  .xmm                 ; enable the XMM instruction set in the assembler
  .model flat
endif

; Assembly-time configuration of the generator.
; TEMPERING selects whether the output transformation is assembled in.
; The IF 0 / ELSE switch below selects one of two parameter sets; change
; "IF 0" to "IF 1" to build MT11213A instead of MT19937.
TEMPERING EQU 1              ; set to 0 if no tempering (improves speed by 25%)

IF 0
; define constants for MT11213A:
N       = 351                ; number of 32-bit words in the MT history buffer
M       = 175                ; distance (in words) from mt[kk] to mt[km] in the refill loop
R       = 19                 ; number of bits in LOWER_MASK
MATRIX_A = 0E4BD75F5H        ; xor-ed into the result when bit 0 of y is set
TEMU    = 11                 ; tempering: first right-shift count
TEMS    = 7                  ; tempering: first left-shift count (masked with TEMB)
TEMT    = 15                 ; tempering: second left-shift count (masked with TEMC)
TEML    = 17                 ; tempering: final right-shift count
TEMB    = 655E5280H          ; tempering mask applied after the TEMS shift
TEMC    = 0FFD58000H         ; tempering mask applied after the TEMT shift

ELSE
; or constants for MT19937:
N       = 624                ; number of 32-bit words in the MT history buffer
M       = 397                ; distance (in words) from mt[kk] to mt[km] in the refill loop
R       = 31                 ; number of bits in LOWER_MASK
MATRIX_A = 09908B0DFH        ; xor-ed into the result when bit 0 of y is set
TEMU    = 11                 ; tempering: first right-shift count
TEMS    = 7                  ; tempering: first left-shift count (masked with TEMB)
TEMT    = 15                 ; tempering: second left-shift count (masked with TEMC)
TEML    = 18                 ; tempering: final right-shift count
TEMB    = 9D2C5680H          ; tempering mask applied after the TEMS shift
TEMC    = 0EFC60000H         ; tempering mask applied after the TEMT shift

ENDIF

; Masks that split a word into its upper 32-R bits (taken from mt[kk]) and
; its lower R bits (taken from mt[kk+1]) when the buffer is refilled.
LOWER_MASK = (1 SHL R) - 1             ; lower R bits
UPPER_MASK = -1 SHL R                  ; upper 32-R bits

; data segment
; All generator state lives here as static data.
; Layout constraints that the code relies on:
;  - TMB, TMC, PREMADE and MT are accessed with movdqa and therefore follow
;    an ALIGN 16.
;  - ALENGTH must be the 4 dwords immediately before MT: the SSE2 refill
;    loop stores a copy of the last 4 MT words at [MT-4*4].
;  - The 4 dwords after MT receive a copy of the first MT words so that
;    reads of mt[kk+1] past the end of the buffer need no special case.
IFDEF	??version      ; for Borland TASM v. 3.0 or later
  _DATA1 SEGMENT PARA PUBLIC 'DATA'
ELSE
  .DATA
ENDIF

LMASK   DD      LOWER_MASK             ; constants, loaded and broadcast by the SSE2 refill loop
UMASK   DD      UPPER_MASK
MATA    DD      MATRIX_A
ALIGN   8
FLOAT32 DQ      0                      ; premade float, 32 bits resolution
FLOAT52 DQ      0                      ; premade float, 52 bits resolution
FLOAT63 DT      1.5                    ; premade float, 63 bits resolution (only the low 8 bytes are rewritten; sign/exponent stay as initialized)
FLREADY DB      0                      ; 1=FLOAT32 valid, 2=FLOAT52 valid, 4=FLOAT63 valid
INSTRSET DB     0                      ; >= 4 if use of XMM registers (SSE2) allowed; 0 until INIT1 has stored the result of _InstructionSet
ALIGN   4
ONE     DD      1.0                    ; subtracted from the premade floats to move [1,2) to [0,1)

IF      TEMPERING
ALIGN   16
TMB     DD      TEMB, TEMB, TEMB, TEMB ; constants, one copy per 32-bit lane
TMC     DD      TEMC, TEMC, TEMC, TEMC
PREMADE DD      0, 0, 0, 0             ; tempered numbers, ready to use
IF      N AND 3
        DD      (4-(N AND 3)) DUP (0)  ; last PREMADE unaligned overrun (written by the movdqu at M41)
ENDIF  
PREMI   DD      4*4                    ; byte index to next PREMADE number; 4*4 means the list is empty
ENDIF

MTI     DD      0                      ; byte index into MT buffer; N*4 means the buffer is used up
ALIGN   16
ALENGTH DD      4  dup (0)             ; buffer km wraparound and temporary storage of length
MT      DD      N  dup (0)             ; history buffer
        DD      4  dup (0)             ; buffer kk wraparound

IFDEF	??version      ; Borland TASM
  _DATA1 ENDS
  _TEXT segment DWORD PUBLIC 'CODE'
  FLAT GROUP _DATA1,_TEXT
  ASSUME DS:FLAT,CS:FLAT
ELSE
  .CODE
ENDIF

; code segment

PublicAlias MACRO AliasName   ; export one more public name for the current code address
        public AliasName               ; make the name visible to the linker
        AliasName label near           ; bind the name to the current location
ENDM

extrn _InstructionSet:near             ; detect CPU instruction set supported

; void __stdcall TRandomInit(int seed)
; Removes the argument from the stack itself, then continues in the common
; code at TRandomInit_e1 with eax = seed and only the return address left
; on the stack.
_TRandomInit@4 PROC NEAR               ; stdcall entry for DLL
public _TRandomInit@4
        mov     eax, [esp+4]           ; eax = seed
        pop     edx                    ; edx = return address; esp now points at the seed slot
        mov     [esp], edx             ; return address replaces the seed on top of the stack
        jmp     TRandomInit_e1         ; shared initialization code
_TRandomInit@4 ENDP

; void TRandomInit(int seed)        (caller removes the argument)
; Seeds the MT buffer from a single 32-bit value, marks all premade
; floating point numbers as invalid and, when the SSE2 path is in use,
; prepares the buffers that path reads from.
; TRandomInit_e1 is the entry used by the stdcall wrapper: eax = seed.
TRandomInit PROC NEAR
public TRandomInit                     ; mangled names are not needed if extern "C" declaration
PublicAlias _TRandomInit               ; extern "C" name
PublicAlias ?TRandomInit@@YAXH@Z       ; MS mangled name
PublicAlias @TRandomInit$qi            ; Borland mangled name
PublicAlias _TRandomInit__Fi           ; Gnu mangled name (Windows)
PublicAlias TRandomInit__Fi            ; Gnu mangled name (Linux)
PublicAlias _Z11TRandomIniti           ; Gnu mangled name (UNIX)

        mov     eax, [esp+4]           ; eax = seed
TRandomInit_e1  LABEL NEAR
        mov     [FLREADY], 0           ; forget any premade floats from an earlier seed
        call    INIT1                  ; fill MT from eax; also determines INSTRSET on first use
        cmp     [INSTRSET], 4          ; is the XMM/SSE2 version in use?
        jae     TRandomInit_fill
        ret                            ; scalar version refills on demand, nothing more to do

TRandomInit_fill:
        call    M70                    ; run the SSE2 refill so MT holds usable numbers
IF      TEMPERING
        call    M30                    ; temper the first 4 numbers into PREMADE
ENDIF
        ret
TRandomInit ENDP
        

; INIT1 - internal helper
; In:   eax = seed
; Does: mt[0] = seed, mt[i] = 1812433253 * (mt[i-1] ^ (mt[i-1] >> 30)) + i
;       for i = 1 .. N-1; sets MTI = N*4 (buffer marked as used up);
;       if INSTRSET is still 0, stores the low byte returned by
;       _InstructionSet in it.
; Modifies eax, ecx, edx and flags.
INIT1   PROC    NEAR
        xor     ecx, ecx               ; i = 0
INIT1_next:
        mov     [MT+ecx*4], eax        ; mt[i] = current value
        mov     edx, eax
        shr     edx, 30                ; edx = mt[i] >> 30
        xor     eax, edx               ; mt[i] ^ (mt[i] >> 30)
        imul    eax, eax, 1812433253   ; * multiplier
        inc     ecx                    ; i++
        add     eax, ecx               ; + i  (value for the next slot)
        cmp     ecx, N
        jb      INIT1_next
        shl     ecx, 2                 ; N*4 = byte offset just past the buffer
        mov     [MTI], ecx             ; forces a refill before the first number is used

        ; check microprocessor and OS support for XMM instructions (only once):
        cmp     [INSTRSET], 0
        jne     INIT1_done             ; already determined by an earlier call
        call    _InstructionSet        ; detect instruction set
        mov     [INSTRSET], al
INIT1_done:
        ret
INIT1   ENDP

; void __stdcall TRandomInitByArray(int seeds[], int length)
; Takes both arguments off the stack, saves the registers that the common
; code restores on exit, and joins it at TRandomInitByArray_e1 with
; ebx = seeds and ebp = length.
_TRandomInitByArray@8 PROC NEAR        ; stdcall entry for DLL
PUBLIC _TRandomInitByArray@8
        pop     eax                    ; eax = return address
        pop     ecx                    ; ecx = seeds
        pop     edx                    ; edx = length
        push    eax                    ; return address back on top, arguments removed
        push    ebx                    ; same save order as the cdecl entry
        mov     ebx, ecx               ; ebx = seeds
        push    esi
        push    edi
        push    ebp
        mov     ebp, edx               ; ebp = length
        jmp     TRandomInitByArray_e1
_TRandomInitByArray@8 ENDP

; void TRandomInitByArray(int seeds[], int length)   (caller removes arguments)
; Seeds the generator from an array of integers:
;   1. INIT1 with the fixed seed 19650218
;   2. max(N, length) mixing steps that fold seeds[j] + j into mt[i]
;   3. N-1 further mixing steps that subtract i
;   4. mt[0] = 0x80000000
; If length <= 0, steps 2-4 are skipped.
; Register use after the entry code:
;   ebx = seeds, ebp = loop counter k, ecx = address of MT,
;   esi = i, edi = j, eax/edx = scratch.
; TRandomInitByArray_e1 is the entry used by the stdcall wrapper; it
; expects ebx, esi, edi, ebp already pushed, ebx = seeds, ebp = length.
TRandomInitByArray PROC NEAR
public TRandomInitByArray
PublicAlias _TRandomInitByArray                  ; extern "C" name
PublicAlias ?TRandomInitByArray@@YAXQAKH@Z       ; MS mangled name
PublicAlias @TRandomInitByArray$qpuli            ; Borland mangled name
PublicAlias _TRandomInitByArray__FPUli           ; Gnu mangled name (Windows)
PublicAlias TRandomInitByArray__FPUli            ; Gnu mangled name (Linux)

        push    ebx
        push    esi
        push    edi
        push    ebp
        mov     ebp, [esp+24]          ; length
        mov     ebx, [esp+20]          ; seeds
TRandomInitByArray_e1 LABEL NEAR
        mov     [FLREADY], 0           ; premade floats are no longer valid
        mov     [ALENGTH], ebp         ; keep length in memory; ebp becomes the counter
        mov     eax, 19650218
        call    INIT1                  ; init_genrand(19650218UL);
        test    ebp, ebp
        jle     IBA_finish             ; error: length <= 0
        mov     ecx, offset ds:MT      ; ecx = data pointer to reduce code size
        mov     esi, 1                 ; i = 1
        xor     edi, edi               ; j = 0
        cmp     ebp, N
        jae     IBA_mix1               ; length >= N: k = length
        mov     ebp, N                 ; otherwise k = N, so k = max (N,length)

        ; for (; k; k--) {
IBA_mix1:
        mov     eax, [ecx+esi*4-4]     ; mt[i-1]
        mov     edx, eax
        shr     edx, 30                ; mt[i-1] >> 30
        xor     eax, edx               ; mt[i-1] ^ (mt[i-1] >> 30)
        imul    eax, eax, 1664525      ; * 1664525
        xor     eax, [ecx+esi*4]       ; ^ mt[i]
        add     eax, edi               ; + j
        add     eax, [ebx+edi*4]       ; + seeds[j]
        mov     [ecx+esi*4], eax       ; save in mt[i]
        inc     edi                    ; j++
        inc     esi                    ; i++
        cmp     edi, [ALENGTH]         ; length
        jb      IBA_j_ok               ; if (j>=length)
        xor     edi, edi               ;     j = 0;
IBA_j_ok:
        cmp     esi, N
        jb      IBA_i_ok1              ; if (i>=N) {
        mov     eax, [ecx+(N-1)*4]     ;     mt[0] = mt[N-1];
        mov     [ecx], eax
        mov     esi, 1                 ;     i=1; }
IBA_i_ok1:
        dec     ebp                    ; k--
        jnz     IBA_mix1               ; first k loop

        ; for (k = N-1; k; k--) {
        mov     ebp, N-1               ; k
IBA_mix2:
        mov     eax, [ecx+esi*4-4]     ; mt[i-1]
        mov     edx, eax
        shr     edx, 30                ; mt[i-1] >> 30
        xor     eax, edx               ; mt[i-1] ^ (mt[i-1] >> 30)
        imul    eax, eax, 1566083941   ; * 1566083941
        xor     eax, [ecx+esi*4]       ; ^ mt[i]
        sub     eax, esi               ; - i
        mov     [ecx+esi*4], eax       ; save in mt[i]
        inc     esi                    ; i++
        cmp     esi, N
        jb      IBA_i_ok2              ; if (i>=N) {
        mov     eax, [ecx+(N-1)*4]     ;     mt[0] = mt[N-1];
        mov     [ecx], eax
        mov     esi, 1                 ;     i=1; }
IBA_i_ok2:
        dec     ebp                    ; k--
        jnz     IBA_mix2               ; second k loop
        mov     dword ptr [ecx], 80000000H ; mt[0] = 0x80000000

IBA_finish:
        cmp     [INSTRSET], 4          ; is the XMM/SSE2 version in use?
        jb      IBA_restore
        call    M70                    ; update MT buffer
IF      TEMPERING
        call    M30                    ; premake 4 tempered random numbers
ENDIF
IBA_restore:
        pop     ebp
        pop     edi
        pop     esi
        pop     ebx
        ret
TRandomInitByArray ENDP


; unsigned TBRandom(void)
; Returns 32 random bits in eax. Takes no stack arguments, so the same
; code serves the cdecl and stdcall (_TBRandom@0) names.
;
; Code paths:
;   INSTRSET >= 4, TEMPERING <> 0 : numbers are handed out from the 4-entry
;       PREMADE list. When the list is used up, M30 tempers the next 4 MT
;       words with SSE2; when MT is used up, M70 regenerates it.
;   INSTRSET >= 4, TEMPERING = 0  : MT words are returned directly and M70
;       regenerates the buffer when it is used up.
;   INSTRSET <  4                 : scalar code at R20 / R50.
; INSTRSET is only written by INIT1, so an unseeded generator always takes
; the scalar path.
;
; M30 and M70 are also entered with CALL from the initialization routines.
; Both leave eax untouched, which is why they can be reached by a jump
; after the return value has been loaded.
;
; Modifies: eax (result), ecx, edx, flags; xmm0-xmm6 on the SSE2 path.
; ebx is saved and restored on the scalar refill path.
;
; Label case is uniform (M80, M90) so the file also assembles when the
; assembler treats symbols as case sensitive.
TBRandom PROC NEAR                     ; generate random bits
public TBRandom
PublicAlias _TBRandom@0                ; stdcall entry for DLL
PublicAlias _TBRandom                  ; extern "C" name
PublicAlias ?TBRandom@@YAIXZ           ; MS mangled name
PublicAlias @TBRandom$qv               ; Borland mangled name
PublicAlias _TBRandom__Fv              ; Gnu mangled name (Windows)
PublicAlias TBRandom__Fv               ; Gnu mangled name (Linux)
PublicAlias _Z8TBRandomv               ; Gnu mangled name (UNIX)

        cmp     [INSTRSET], 4          ; can we use XMM registers and SSE2 ?
        jb      R20                    ; no: scalar version below

        ; this version uses XMM registers and SSE2 instructions:
IF      TEMPERING
        mov     edx, [PREMI]           ; byte index of the next unused PREMADE entry
        mov     eax, [PREMADE+edx]     ; fetch premade random number (the return value)
        add     edx, 4
        mov     [PREMI], edx
        cmp     edx, 4*4
        jnb     M30                    ; that was the last entry: restock before returning
        ret                            ; return premade number

M30     LABEL   NEAR
; PREMADE list is empty. Make 4 more numbers ready for next call.
; eax is not touched from here on.
        mov     ecx, [MTI]             ; fetch 4 numbers from MT buffer
        movdqa  xmm0, MT[ecx]          ; aligned load: MT is 16-byte aligned and MTI advances in steps of 16
        movdqa  xmm1, xmm0             ; tempering algorithm
        psrld   xmm0, TEMU
        pxor    xmm0, xmm1             ; y ^= y >> TEMU
        movdqa  xmm1, xmm0
        pslld   xmm0, TEMS
        pand    xmm0, [TMB]
        pxor    xmm0, xmm1             ; y ^= (y << TEMS) & TEMB
        movdqa  xmm1, xmm0
        pslld   xmm0, TEMT
        pand    xmm0, [TMC]
        pxor    xmm0, xmm1             ; y ^= (y << TEMT) & TEMC
        movdqa  xmm1, xmm0
        psrld   xmm0, TEML
        pxor    xmm0, xmm1             ; y ^= y >> TEML
        movdqa  [PREMADE], xmm0
        mov     [PREMI], 0             ; save 4 PREMADE numbers ready to use
        add     ecx, 4*4               ; increment MTI index into MT buffer
        mov     [MTI], ecx
        cmp     ecx, N*4
        jae     M41                    ; MT used up: regenerate it now
        ret                            ; return random number in eax

; MT buffer exhausted. Make N new numbers ready for next time
M41:                                   ; eax is the random number to return
IF      N AND 3                        ; if N is not divisible by 4
        NVALID = N AND 3               ; only NVALID of the numbers in xmm0 are valid
        movdqu  [PREMADE + (4-NVALID)*4], xmm0 ; store again, shifted up so the valid numbers end at PREMADE[3]
        mov     [PREMI], (4-NVALID)*4  ; save index to first valid entry in PREMADE
ENDIF
ELSE    ; no tempering
        mov     ecx, [MTI]             ; fetch number from MT buffer
        mov     eax, MT[ecx]
        add     ecx, 1*4               ; increment MTI index into MT buffer
        mov     [MTI], ecx
        cmp     ecx, N*4
        jae     M70                    ; MT used up: regenerate it now
        ret
ENDIF

M70     LABEL   NEAR
; MT buffer is empty. Fill it up, 4 words per iteration.
; ecx = address of mt[kk], edx = address of mt[km]  (km = kk + M, modulo N)
; xmm3 = UPPER_MASK, xmm4 = LOWER_MASK, xmm5 = MATRIX_A in all four lanes.
        movd    xmm3, [UMASK]          ; load constants
        movd    xmm4, [LMASK]
        movd    xmm5, [MATA]
        pshufd  xmm3, xmm3, 0          ; broadcast constants
        pshufd  xmm4, xmm4, 0
        pshufd  xmm5, xmm5, 0
        mov     ecx,  offset MT        ; kk
        mov     edx,  offset MT+M*4    ; km

M80:    ; kk loop
        movdqa  xmm2, [ecx]            ; mt[kk]
        movd    xmm6, [ecx+16]         ; mt[kk+4]
        movdqa  xmm1, [ecx]            ; mt[kk]
        movss   xmm2, xmm6             ; faster than movdqu xmm2, [ecx+4]
        pshufd  xmm2, xmm2, 00111001B  ; mt[kk+1]
        movq    xmm0, [edx]            ; mt[km]
        movq    xmm6, [edx+8]
        punpcklqdq xmm0, xmm6          ; faster than movdqu xmm0, [edx]
        pand    xmm1, xmm3             ; mt[kk] & UPPER_MASK
        pand    xmm2, xmm4             ; mt[kk+1] & LOWER_MASK
        por     xmm1, xmm2             ; y
        movdqa  xmm2, xmm1             ; y
        pslld   xmm1, 31               ; copy bit 0 into all bits
        psrad   xmm1, 31               ; -(y & 1)
        pand    xmm1, xmm5             ; & MATRIX_A
        psrld   xmm2, 1                ; y >> 1
        pxor    xmm0, xmm1
        pxor    xmm0, xmm2             ; mt[km] ^ (y >> 1) ^ (-(y & 1) & MATRIX_A)
        movdqa  [ecx], xmm0            ; result into mt[kk]
        cmp     ecx, offset MT + (N-4)*4
        jae     M90                    ; exit loop when kk past end of buffer
        add     ecx, 16                ; kk += 4
        add     edx, 16                ; km += 4
        cmp     edx, offset MT + (N-4)*4
        jbe     M80                    ; skip unless km wraparound
        sub     edx, N*4               ; km wraparound
        movdqu  xmm0, [MT+(N-4)*4]     ; copy end to before begin for km wraparound
        movdqa  [MT-4*4], xmm0         ; (this overwrites ALENGTH)
        movdqa  xmm0, [MT]             ; copy begin to after end for kk wraparound
        movdqu  [MT+N*4], xmm0
        jmp     M80

M90:    ; loop finished. discard excess part of last result
        mov     [MTI], 0
        ret                            ; random number is still in eax


; this version is for old processors without XMM support:
R20:    mov     ecx, [MTI]
        cmp     ecx, N*4
        jnb     short R50              ; buffer is empty, fill it
R40:    mov     eax, MT[ecx]           ; next untempered number
        add     ecx, 4
        mov     [MTI], ecx

IF      TEMPERING                      ; optional tempering
        mov     edx, eax
        shr     eax, TEMU
        xor     eax, edx               ; y ^= y >> TEMU
        mov     edx, eax
        shl     eax, TEMS
        and     eax, TEMB
        xor     eax, edx               ; y ^= (y << TEMS) & TEMB
        mov     edx, eax
        shl     eax, TEMT
        and     eax, TEMC
        xor     eax, edx               ; y ^= (y << TEMT) & TEMC
        mov     edx, eax
        shr     eax, TEML
        xor     eax, edx               ; y ^= y >> TEML
ENDIF   ; TEMPERING
        ret

        ; fill buffer with random numbers
        ; ecx = address of mt[kk], ebx = address of mt[km]
R50:    push    ebx
        mov     ecx, offset MT
        mov     ebx, offset MT + M*4
        ; kk loop
R60:    mov     eax, [ecx]             ; mt[kk]
        mov     edx, [ecx+4]           ; mt[kk+1]
        and     eax, UPPER_MASK
        and     edx, LOWER_MASK
        or      eax, edx               ; y
        shr     eax, 1                 ; y >> 1, bit 0 of y goes to carry
        sbb     edx, edx               ; edx = -(y & 1)
        and     edx, MATRIX_A
        xor     eax, edx
        xor     eax, [ebx]             ; ^ mt[km]
        mov     [ecx], eax             ; result into mt[kk]
        add     ebx, 4                 ; km++
        cmp     ebx, offset MT + N*4
        jb      short R70
        mov     eax, [MT]
        mov     [ebx], eax ; copy begin of table to after end to simplify kk+1 wraparound
        mov     ebx, offset MT         ; km wraparound
R70:    add     ecx, 4                 ; kk++
        cmp     ecx, offset MT + N*4
        jb      R60                    ; loop end
        xor     ecx, ecx
        mov     [MTI], ecx             ; MTI = 0, and ecx = 0 is the index R40 reads from
        pop     ebx
        jmp     R40                    ; deliver the first number of the new buffer

TBRandom ENDP


; double TRandom(void)
; Returns a random number in the interval [0,1) in st(0), built from one
; 32-bit random number.
; A double in [1,2) is kept premade in FLOAT32 (exponent bits 3FF, random
; bits in the top of the mantissa); each call loads it, subtracts 1.0 and
; then premakes the value for the following call. Bit 0 of FLREADY tells
; whether FLOAT32 holds a valid value.
; Modifies eax, ecx, edx (through TBRandom) and flags.
TRandom PROC NEAR                      ; generate random float with 32 bits resolution
public TRandom
PublicAlias _TRandom@0                 ; stdcall entry for DLL
PublicAlias _TRandom                   ; extern "C" name
PublicAlias ?TRandom@@YANXZ            ; MS mangled name
PublicAlias @TRandom$qv                ; Borland mangled name
PublicAlias _TRandom__Fv               ; Gnu mangled name (Windows)
PublicAlias TRandom__Fv                ; Gnu mangled name (Linux)
PublicAlias _Z7TRandomv                ; Gnu mangled name (UNIX)

        test    [FLREADY], 1           ; is FLOAT32 valid?
        jnz     short TR_ready
        ; first call after seeding: nothing premade yet
        call    TR_refill              ; run the premake code once as a subroutine
        or      [FLREADY], 1           ; FLOAT32 is valid from now on
TR_ready:
        fld     [FLOAT32]              ; premade random number in interval [1,2)
        fsub    [ONE]                  ; shift to [0,1)
TR_refill:
        ; premake the number for the next call
        call    TBRandom               ; eax = 32 random bits
        mov     edx, eax
        shl     edx, 20                ; low 12 random bits -> top of the low mantissa dword
        shr     eax, 12                ; high 20 random bits -> high mantissa bits
        or      eax, 3FF00000H         ; sign 0, exponent 3FF
        mov     dword ptr [FLOAT32], edx
        mov     dword ptr [FLOAT32+4], eax
        ret

TRandom ENDP


; double TRandom2(void)
; Returns a random number in the interval [0,1) in st(0), built from two
; 32-bit random numbers (20 + 32 = 52 mantissa bits).
; A double in [1,2) is kept premade in FLOAT52; each call loads it,
; subtracts 1.0 and then premakes the value for the following call.
; Bit 1 of FLREADY tells whether FLOAT52 holds a valid value.
; Modifies eax, ecx, edx (through TBRandom) and flags.
TRandom2 PROC NEAR                     ; generate random float with 52 bits resolution
public TRandom2
PublicAlias _TRandom2@0                ; stdcall entry for DLL
PublicAlias _TRandom2                  ; extern "C" name
PublicAlias ?TRandom2@@YANXZ           ; MS mangled name
PublicAlias @TRandom2$qv               ; Borland mangled name
PublicAlias _TRandom2__Fv              ; Gnu mangled name (Windows)
PublicAlias TRandom2__Fv               ; Gnu mangled name (Linux)
PublicAlias _Z8TRandom2v               ; Gnu mangled name (UNIX)

        test    [FLREADY], 2           ; is FLOAT52 valid?
        jnz     short TS_ready
        ; first call after seeding: nothing premade yet
        call    TS_refill              ; run the premake code once as a subroutine
        or      [FLREADY], 2           ; FLOAT52 is valid from now on
TS_ready:
        fld     [FLOAT52]              ; premade random number in interval [1,2)
        fsub    [ONE]                  ; shift to [0,1)
TS_refill:
        ; premake the number for the next call
        call    TBRandom               ; first random number -> upper dword
        shr     eax, 12                ; keep 20 bits for the high mantissa bits
        or      eax, 3FF00000H         ; sign 0, exponent 3FF
        mov     dword ptr [FLOAT52+4], eax
        call    TBRandom               ; second random number -> lower dword
        mov     dword ptr [FLOAT52], eax
        ret

TRandom2 ENDP


; long double TRandom3(void)
; Returns a random number in the interval [0,1) in st(0), built from two
; 32-bit random numbers (31 + 32 = 63 bits below the leading 1).
; An 80-bit value in [1,2) is kept premade in FLOAT63. Only its 8 mantissa
; bytes are rewritten here; the sign/exponent word keeps the value it was
; assembled with. Each call loads the premade value, subtracts 1.0 and
; then premakes the value for the following call. Bit 2 of FLREADY tells
; whether FLOAT63 holds a valid value.
; Modifies eax, ecx, edx (through TBRandom) and flags.
TRandom3 PROC NEAR                     ; generate random float with 63 bits resolution
public TRandom3
PublicAlias _TRandom3@0                ; stdcall entry for DLL
PublicAlias _TRandom3                  ; extern "C" name
PublicAlias ?TRandom3@@YAOXZ           ; MS mangled name
PublicAlias @TRandom3$qv               ; Borland mangled name
PublicAlias _TRandom3__Fv              ; Gnu mangled name (Windows)
PublicAlias TRandom3__Fv               ; Gnu mangled name (Linux)
PublicAlias _Z8TRandom3v               ; Gnu mangled name (UNIX)

        test    [FLREADY], 4           ; is FLOAT63 valid?
        jnz     short TT_ready
        ; first call after seeding: nothing premade yet
        call    TT_refill              ; run the premake code once as a subroutine
        or      [FLREADY], 4           ; FLOAT63 is valid from now on
TT_ready:
        fld     [FLOAT63]              ; premade random number in interval [1,2)
        fsub    [ONE]                  ; shift to [0,1)
TT_refill:
        ; premake the number for the next call
        call    TBRandom               ; first random number -> upper mantissa dword
        shr     eax, 1                 ; make room for the explicit leading 1
        or      eax, 80000000H         ; set the leading mantissa bit
        mov     dword ptr [FLOAT63+4], eax
        call    TBRandom               ; second random number -> lower mantissa dword
        mov     dword ptr [FLOAT63], eax
        ret

TRandom3 ENDP


; int TIRandom(int min, int max)        (caller removes the arguments)
; In:   [esp+4] = min, [esp+8] = max
; Out:  eax = random integer with min <= eax <= max,
;       or 80000000H if max < min.
; The result is min + floor(r * (max-min+1) / 2^32), where r is the 32-bit
; number from TBRandom.
;
; The range check uses the signed condition "less" on max - min rather than
; the sign bit of the difference. The sign bit alone is wrong whenever the
; subtraction overflows: it rejected valid ranges wider than 2^31 - 1 and
; accepted some ranges with max < min.
; The widest range (min = -2^31, max = 2^31-1) makes max-min+1 wrap to 0;
; it is handled separately by adding min to the raw 32 random bits.
;
; Modifies eax, ecx, edx and flags.
TIRandom PROC   NEAR
public TIRandom
PublicAlias _TIRandom                  ; extern "C" name
PublicAlias ?TIRandom@@YAHHH@Z         ; MS mangled name
PublicAlias @TIRandom$qii              ; Borland mangled name
PublicAlias _TIRandom__Fii             ; Gnu mangled name (Windows)
PublicAlias TIRandom__Fii              ; Gnu mangled name (Linux)
PublicAlias _Z8TIRandomii              ; Gnu mangled name (UNIX)

        call    TBRandom               ; eax = 32 random bits
        mov     edx, [esp+8]           ; max
        mov     ecx, [esp+4]           ; min
        sub     edx, ecx               ; max - min (modulo 2^32)
        jl      short RERROR           ; signed max < min
        add     edx, 1                 ; interval length = max - min + 1
        jz      short RFULL            ; wrapped to 0: interval covers all 2^32 values
        mul     edx                    ; edx = floor(random * interval / 2^32)
        lea     eax, [edx+ecx]         ; add min
        ret
RFULL:  add     eax, ecx               ; every 32-bit value is valid: min + random bits
        ret
RERROR: mov     eax, 80000000H         ; error exit
        ret
TIRandom ENDP


; int __stdcall TIRandom(int min, int max)   (this routine removes the arguments)
; In:   [esp+4] = min, [esp+8] = max
; Out:  eax = random integer with min <= eax <= max,
;       or 80000000H if max < min.
; The result is min + floor(r * (max-min+1) / 2^32), where r is the 32-bit
; number from TBRandom.
;
; The range check uses the signed condition "less" on max - min rather than
; the sign bit of the difference. The sign bit alone is wrong whenever the
; subtraction overflows: it rejected valid ranges wider than 2^31 - 1 and
; accepted some ranges with max < min.
; The widest range (min = -2^31, max = 2^31-1) makes max-min+1 wrap to 0;
; it is handled separately by adding min to the raw 32 random bits.
;
; Modifies eax, ecx, edx and flags.
_TIRandom@8 PROC   NEAR                ; stdcall version for DLL
PUBLIC _TIRandom@8
        call    TBRandom               ; eax = 32 random bits
        mov     edx, [esp+8]           ; max
        mov     ecx, [esp+4]           ; min
        sub     edx, ecx               ; max - min (modulo 2^32)
        jl      short RERROR1          ; signed max < min
        add     edx, 1                 ; interval length = max - min + 1
        jz      short RFULL1           ; wrapped to 0: interval covers all 2^32 values
        mul     edx                    ; edx = floor(random * interval / 2^32)
        lea     eax, [edx+ecx]         ; add min
        ret     8
RFULL1: add     eax, ecx               ; every 32-bit value is valid: min + random bits
        ret     8
RERROR1:mov     eax, 80000000H         ; error exit
        ret     8
_TIRandom@8 ENDP        

END


comment ~
Example of calling from C++ program:

#include "randoma.h"
#include <stdio.h>
#include <time.h>

int seed = time(0);
TRandomInit (seed);
for (int i=0; i<100; i++) {
  printf ("\n%14.10f", TRandom());}

~
