[ Branimir Maksimovic @ 25.12.2017. 15:15 ] @
Samo 4 takta po iteraciji na mom Haswell-u za varijantu sa pocetnom magicnom vrednoscu, a 8 za varijantu sa reciprocnom vrednoscu male preciznosti (vrcpps).
Deljenje (vdivpd) traje citavih 25 taktova zato sto se ono na Haswell-u i ranijim procesorima deli na dve operacije.

Code:

~/.../examples/assembler >>> cat latency.asm                                                                                                                                                          
; latency test
; Output format: 64-bit ELF object file.
format elf64
; Symbols exported to the linker.
public recip
public recip1
public recip2
public recip3
public _rdtsc
section '.text' executable
; Number of loop iterations each recip* routine executes per call.
N = 1000000
recip:
recip1:
; Reciprocal of four packed doubles at [rdi], computed in place.
; The initial estimate comes from an integer subtraction against the `magic`
; constant and is then refined with a chain of FMA correction steps.
; The loop body runs N times; every iteration reloads the same input from
; [rdi], so iterations do not depend on each other. Only the last result
; is stored back.
; In:  rdi = pointer to 4 doubles (32 bytes), overwritten with the result.
; Clobbers: eax, ymm0, ymm1, ymm2, ymm4, flags.
; Load constants and input
   vbroadcastsd ymm1, [one]            ; ymm1 = 1.0 in every lane
   vpbroadcastq ymm4, [magic]          ; ymm4 = magic constant in every lane
   mov eax, N
   .loop:
   vmovdqu ymm0, [rdi]                 ; ymm0 = x
      vpsubq ymm2, ymm4, ymm0          ; ymm2 = magic - bits(x): initial estimate y
      vfnmadd213pd ymm0, ymm2, ymm1    ; ymm0 = 1 - x*y           (residual e)
      vfmadd132pd ymm2, ymm2, ymm0     ; ymm2 = y + y*e
      vmulpd ymm0, ymm0, ymm0          ; ymm0 = e^2
      vfmadd132pd ymm2, ymm2, ymm0     ; ymm2 = ymm2 + ymm2*e^2
      vmulpd ymm0, ymm0, ymm0          ; ymm0 = e^4
      vfmadd132pd ymm2, ymm2, ymm0     ; ymm2 = ymm2 + ymm2*e^4
      vmulpd ymm0, ymm0, ymm0          ; ymm0 = e^8
      vfmadd132pd ymm0, ymm2, ymm2     ; ymm0 = ymm2 + ymm2*e^8   (final result)
      dec eax
      jnz .loop
   vmovupd [rdi], ymm0                 ; store the four reciprocals
   ; Clear the upper YMM halves before returning to the caller, which is
   ; compiled code that may use non-VEX SSE instructions.
   vzeroupper
   ret

recip2:
; Reciprocal of four packed doubles at [rdi], computed in place.
; The initial estimate is the single-precision approximate reciprocal
; (vrcpps) of the input converted to float, widened back to double and then
; refined with a chain of FMA correction steps.
; The loop body runs N times; every iteration reloads the same input from
; [rdi], so iterations do not depend on each other. Only the last result
; is stored back.
; In:  rdi = pointer to 4 doubles (32 bytes), overwritten with the result.
; Clobbers: eax, ymm0, ymm1, ymm2, flags.
; Load constants and input
   vbroadcastsd ymm1, [one]            ; ymm1 = 1.0 in every lane
   mov eax, N
   .loop:
   vmovupd ymm0, [rdi]                 ; ymm0 = x
      vcvtpd2ps xmm2,ymm0              ; narrow x to four floats
      vrcpps xmm2,xmm2                 ; approximate 1/x in single precision
      vcvtps2pd ymm2,xmm2              ; ymm2 = initial estimate y as doubles
      vfnmadd213pd ymm0, ymm2, ymm1    ; ymm0 = 1 - x*y           (residual e)
      vfmadd132pd ymm2, ymm2, ymm0     ; ymm2 = y + y*e
      vmulpd ymm0, ymm0, ymm0          ; ymm0 = e^2
      vfmadd132pd ymm2, ymm2, ymm0     ; ymm2 = ymm2 + ymm2*e^2
      vmulpd ymm0, ymm0, ymm0          ; ymm0 = e^4
      vfmadd132pd ymm2, ymm2, ymm0     ; ymm2 = ymm2 + ymm2*e^4
      vmulpd ymm0, ymm0, ymm0          ; ymm0 = e^8
      vfmadd132pd ymm0, ymm2, ymm2     ; ymm0 = ymm2 + ymm2*e^8   (final result)
      dec eax
      jnz .loop
   vmovupd [rdi], ymm0                 ; store the four reciprocals
   ; Clear the upper YMM halves before returning to the caller, which is
   ; compiled code that may use non-VEX SSE instructions.
   vzeroupper
   ret

recip3:
; Reference implementation: reciprocal of four packed doubles at [rdi]
; computed with a plain vdivpd (1.0 / x), in place.
; The loop body runs N times; every iteration reloads the same input from
; [rdi], so iterations do not depend on each other. Only the last result
; is stored back.
; In:  rdi = pointer to 4 doubles (32 bytes), overwritten with the result.
; Clobbers: eax, ymm0, ymm1, flags.
; Load constants and input
   vbroadcastsd ymm1, [one]            ; ymm1 = 1.0 in every lane
   mov eax, N
   .loop:
   vmovupd ymm0, [rdi]                 ; ymm0 = x
   vdivpd ymm0,ymm1,ymm0               ; ymm0 = 1.0 / x
      dec eax
      jnz .loop
   vmovupd [rdi], ymm0                 ; store the four reciprocals
   ; Clear the upper YMM halves before returning to the caller, which is
   ; compiled code that may use non-VEX SSE instructions.
   vzeroupper
   ret

; Read the time-stamp counter and return it as one 64-bit value in rax.
; rdtscp delivers the counter split across edx:eax (and also writes ecx).
_rdtsc:
   rdtscp
   shl rdx, 32          ; move the high half into bits 63..32
   or rax, rdx          ; combine with the low half
   ret

section '.data' writeable align 16
   align 16
   ; Bit pattern of the double 1.0; broadcast by every recip* routine.
   one dq 3FF0000000000000h
   ; 64-bit constant from which recip1 subtracts the input bits to form
   ; its initial reciprocal estimate.
   magic dq 7FDE6238502484BAh



Code:

~/.../examples/assembler >>> fasm latency.asm latencya.o                                                                                                                                              
flat assembler  version 1.72  (16384 kilobytes memory)
3 passes, 1024 bytes.



Program koji koristi ove rutine je primer napisan u Nim-u:

Code:

~/.../examples/assembler >>> cat latency.nim                                                                                                                                                          
import strfmt,random
randomize()
# Link the object file produced by fasm from latency.asm.
{.link:"latencya.o".}
# Assembly routines from latency.asm. Each one reads four float64 values
# through `x` and overwrites them in place with the computed reciprocals.
proc recip1(x:ptr float64){.importc,cdecl.}
proc recip2(x:ptr float64){.importc,cdecl.}
proc recip3(x:ptr float64){.importc,cdecl.}

proc rdtsc():uint64 =
  ## Returns the 64-bit time-stamp counter read with `rdtscp`.
  ##
  ## `rdtscp` delivers the counter in edx:eax and additionally writes ecx,
  ## so rdx and rcx are both declared as clobbered; `shl`/`or` modify the
  ## flags, hence the "cc" clobber.
  # we have to use emit here, nim does not have volatile qualifier for asm statement
  {.emit:
    """asm volatile(
      ".intel_syntax noprefix\n"
      "rdtscp\n"
      "shl rdx,32\n"
      "or rax,rdx\n"
      ".att_syntax\n"
      :"=a"(`result`)
      :
      :"rdx","rcx","cc");
    """.}

# x holds the random inputs; y is the working copy handed to the routines,
# which overwrite it in place.
var x,y : array[4,float64]
for i in x.mitems :
  i = rand(1000.0)

proc f[F](ff:F,title:string) =
  ## Benchmarks one reciprocal routine `ff` and prints, under `title`:
  ## * for each input: the input, the routine's result and `1/x` for reference;
  ## * for each run: the ticks spent between two back-to-back `rdtsc` calls
  ##   and the ticks spent in `ff` divided by the iteration count.
  const
    runs = 11                 # number of timed calls
    iterations = 1000000.0    # must match N in latency.asm
  echo title
  var t0,t1,t2:array[runs,uint64]
  for i in 0..high(t0) :
    # The routine overwrites y in place, so restore the input before every
    # call; otherwise successive calls would alternate between 1/x and x and
    # accumulate rounding error. Done before t0 to stay out of the timings.
    y = x
    t0[i] = rdtsc()
    t1[i] = rdtsc()
    ff(addr y[0])
    t2[i] = rdtsc()
  for i in 0..high(x) :
    echo "{0:24.18f} {1:24.18f} {2:24.18f}".fmt(x[i],y[i],1/x[i])
  for i in 0..high(t0) :
    echo "{0:f}\t{1:f}".fmt(float64(t1[i]-t0[i]),float64(t2[i]-t1[i])/iterations)
f(recip1,"recip1")
f(recip2,"recip2")
f(recip3,"recip3")



Code:

~/.../examples/assembler >>> nim c -d:release latency.nim                                                                                                                                             
Hint: used config file '/home/bmaxa/projects/Nim/config/nim.cfg' [Conf]
Hint: system [Processing]
Hint: latency [Processing]
Hint: strfmt [Processing]
Hint: macros [Processing]
Hint: strutils [Processing]
Hint: parseutils [Processing]
Hint: math [Processing]
Hint: algorithm [Processing]
Hint: unicode [Processing]
Hint: streams [Processing]
Hint: random [Processing]
Hint: times [Processing]
Hint: posix [Processing]
latency.nim(24, 7) Warning: random is deprecated [Deprecated]
Hint:  [Link]
Hint: operation successful (25498 lines compiled; 0.409 sec total; 54.746MiB peakmem; Release Build) [SuccessX]



I evo kako kod mene izgleda output:

Code:

~/.../examples/assembler >>> ./latency                                                                                                                                                                
recip1
  504.966575886583314048     0.001980329090582430     0.001980329090582427
  212.648786567563917056     0.004702589730895429     0.004702589730895429
  701.523519828456414848     0.001425468956827748     0.001425468956827748
  676.426810034068466880     0.001478356542298545     0.001478356542298544
39.000000    4.449594
21.000000    4.079955
24.000000    4.080168
24.000000    4.109700
24.000000    4.225911
24.000000    4.080264
48.000000    4.096860
21.000000    4.525557
57.000000    5.296398
24.000000    4.464465
39.000000    4.292193
recip2
  504.966575886583314048     0.001980329090582427     0.001980329090582427
  212.648786567563917056     0.004702589730895429     0.004702589730895429
  701.523519828456414848     0.001425468956827748     0.001425468956827748
  676.426810034068466880     0.001478356542298544     0.001478356542298544
24.000000    8.693367
21.000000    8.772282
24.000000    8.470779
24.000000    8.529483
24.000000    9.390477
27.000000    10.128975
24.000000    8.951805
21.000000    8.731911
24.000000    8.738061
48.000000    8.615808
24.000000    8.454282
recip3
  504.966575886583314048     0.001980329090582427     0.001980329090582427
  212.648786567563917056     0.004702589730895429     0.004702589730895429
  701.523519828456414848     0.001425468956827748     0.001425468956827748
  676.426810034068466880     0.001478356542298544     0.001478356542298544
24.000000    26.516313
21.000000    26.567454
21.000000    26.639685
39.000000    26.519364
24.000000    26.308188
24.000000    26.426541
24.000000    26.043252
24.000000    26.151756
24.000000    26.226312
24.000000    26.032281
51.000000    26.181567