[ Branimir Maksimovic @ 28.09.2018. 04:26 ] @
Interesantna stvar kod sortiranja listi je u tome da ako prespojite node sekvencijalni access bude order of magnitude sporiji ;) Znaci sekvencijalni quick sort liste je brzi od merge sorta ;P Ovo je neki moj quick sort koji je drasticno brzi od C++ (gcc) sorta liste: Code: #ifndef VLIBQSORT_H #define VLIBQSORT_H #include <pthread.h> #include <algorithm> #include <functional> namespace VLib{ using std::swap; template <typename T, typename F> inline T median_of_three(T a, T b, T c, F f) { if (f(*a , *b)) if (f(*b , *c)) return b; else if (f(*a , *c)) return c; else return a; else if (f(*a , *c)) return a; else if (f(*b , *c)) return c; else return b; } template <typename T, typename F> void insertion_sort(T begin, T end, F f) { if(begin == end)return; T i = begin; ++i; for(;i!=end;++i) { typeof(*i) v = *i; T j = i; --j; while(f(v,*j)) { T k = j; ++k; *k = *j; if(j == begin){--j;break;} --j; } ++j; *j = v; } } template <typename T,typename F> void qsort(T begin, T end, unsigned size, int t, F f) { struct Data{ T i, j; unsigned k,t; F f; static void* qs(void* d) { Data* t = (Data*)d; VLib::qsort(t->i,t->j,t->k,t->t-1,t->f); return 0; } }; class Thread{ public: typedef void* (*f_t)(void*); Thread(f_t f,void* p) : f_(f),data_(p) { pthread_attr_init(&attr_); pthread_attr_setstacksize(&attr_,256*1024); } void start() { pthread_create(&tid_,&attr_,f_,data_); } void join() { pthread_join(tid_,0); } ~Thread() { pthread_attr_destroy(&attr_); join(); } private: pthread_t tid_; pthread_attr_t attr_; f_t f_; void* data_; }; if(begin == end)return; T high = end, low = begin; if(!size) { T tmp = begin; while(tmp!=end){ high=tmp++;++size; } } else --high; if(size == 1)return; if(size <= 16) { insertion_sort(begin,end,f); return; } unsigned count = 0; T it = begin; while(++count<size/2)++it; it = median_of_three(begin,it,high,f); typeof(*it) pivot = *it; unsigned counthigh = 0,countlow = 0; do { while(high != low && f(pivot,*high)){ --high;++counthigh; } while(low != high && 
f(*low,pivot)){ ++low;++countlow; } if(low != high && !f(*low,pivot) && !f(pivot,*low) && !f(pivot,*high) && !f(*high,pivot)) { while(high != low && !f(*high,*low) && !f(*low,*high)){ --high;++counthigh; } } swap(*low,*high); }while(low != high); T i = low; while(++i != end && !f(*i,*low) && !f(*low,*i) )--counthigh; if(t>0 && size > 1000) { Data d1 = {begin,low,countlow,t,f}, d2 = {i,end,counthigh,t,f}; Thread t1(Data::qs,&d1),t2(Data::qs,&d2); t1.start(); t2.start(); } else { VLib::qsort(begin,low,countlow,0,f); VLib::qsort(i,end,counthigh,0,f); } } template <typename T> inline void qsort(T begin, T end, unsigned size = 0, int t = 2) { VLib::qsort(begin,end,size,t,std::less<typename std::iterator_traits<T>::value_type>()); } } #endif rezultat: Code: ~/.../examples/sort >>> ./bench radix asm sort 536.088 asm bitonic sort 3226 C++ sort 1201.64 VLib sort 1737.32 C qsort 2548.3 C++ list sort 9933.06 VLib list sort 1847.04 all set Znaci moj sort je 5* sic brzi od std C++ sorta dok je naravno sortiranje niza sporije. Pazite sad asembler merge sort vs asembler radix sorta liste protiv radix sorta niza: Code: ~/.../examples/sort >>> cat list_sort.asm format elf64 executable 3 at 0x100000000 struc Node { .next dq ? .data dd ? 
.size = $-.next } virtual at 0 n Node end virtual include 'import64.inc' interpreter '/lib64/ld-linux-x86-64.so.2' needed 'libc.so.6','./speedupavx2.so' import atoi,printf,puts,exit,rand_r,time,malloc, \ radix_sort_avx2 segment executable entry $ mov [N],4096*4096 cmp [rsp],dword 1 je .skip mov rdi,[rsp+16] call [atoi] cmp eax,1 jl .exit mov [N],eax .skip: xor edi,edi call [time] mov [seed],eax ; seed on time mov rdi,fmt1 mov esi,[seed] xor eax,eax call [printf] ; print seed mov rdi,fmt4 mov esi,[N] xor eax,eax call [printf] mov rbx,list1 push rbx call init_list mov rbx,list2 push rbx call init_list add rsp,16 mov edi,[N] imul rdi,4 call [malloc] mov [array],rax mov rdi,[list2] mov rsi,[list1] call assign_list mov rdi,[array] mov rsi,[list1] call assign_array mov rdi,fmtu mov rbx,list1 push rbx call print_list add rsp,8 radix_avx2 equ 1 if defined radix_avx2 call init_time mov rdi,[array] mov esi,[N] call [radix_sort_avx2] mov rdi,fmtar call time_me end if if 1 call init_time mov rbx,[list1] push rbx call radix_sort pop rbx mov [list1],rbx mov rdi,fmtr call time_me end if if 1 call init_time mov rbx,[list2] push rbx call sort pop rbx mov [list2],rbx mov rdi,fmtm call time_me end if mov rdi,[list1] mov rsi,[list2] call equal_list mov rdi,fmte test eax,eax mov rbx,fmtne cmovz rdi,rbx call [puts] mov rdi,[array] mov rsi,[list2] call equal_array mov rdi,fmte test eax,eax mov rbx,fmtne cmovz rdi,rbx call [puts] mov rdi,fmts mov rbx,list1 push rbx call print_list add rsp,8 mov rdi,[list1] call length mov rdx,rcx mov rdi,fmt3 mov rsi,n.size xor eax,eax call [printf] xor edi,edi .exit: call [exit] init_list: mov ebx,[N] mov r12,[rsp+8] .L0: mov edi,n.size call [malloc] mov rcx,[r12] mov [rax+n.next],rcx mov [r12],rax mov rdi,seed if 1 call [rand_r] else rdrand eax end if xor edx,edx mov ecx,[N] div ecx mov rcx,[r12] mov [rcx+n.data],edx dec ebx jnz .L0 ret print_list: call [puts] mov rbx,[rsp+8] mov rbx,[rbx] mov r12,16 .L0: test rbx,rbx jz .exit mov rdi,fmt2 mov 
rsi,[rbx+n.next] mov edx,[rbx+n.data] xor eax,eax call [printf] mov rbx,[rbx+n.next] dec r12 jz .exit jmp .L0 .exit: ret ; rsi source, rdi dest assign_list: .L0: test rsi,rsi jz .exit test rdi,rdi jz .exit mov eax,[rsi+n.data] mov [rdi+n.data],eax mov rsi,[rsi+n.next] mov rdi,[rdi+n.next] jmp .L0 .exit: ret assign_array: .L0: test rsi,rsi jz .exit mov eax,[rsi+n.data] mov [rdi],eax mov rsi,[rsi+n.next] add rdi,4 jmp .L0 .exit: ret equal_list: mov eax,1 .L0: test rsi,rsi jz .exit test rdi,rdi jz .exit mov eax,[rsi+n.data] cmp [rdi+n.data],eax jnz .false mov rsi,[rsi+n.next] mov rdi,[rdi+n.next] jmp .L0 .exit: ret .false: xor eax,eax ret equal_array: mov eax,1 .L0: test rsi,rsi jz .exit mov eax,[rsi+n.data] cmp [rdi],eax jnz .false mov rsi,[rsi+n.next] add rdi,4 jmp .L0 .exit: ret .false: xor eax,eax ret ; [rsp+8] list to sort sort: mov rdi,[rsp+8] call length cmp rcx,1 jle .exit shr rcx,1 ; middle sub rsp,16 ; left,right mov qword[rsp],0 mov qword[rsp+8],0 mov rbx,[rsp+8+16] .L0: ; append to left mov rax,[rsp] mov rdx,[rbx+n.next] mov [rbx+n.next],rax mov [rsp],rbx mov rbx,rdx dec rcx jnz .L0 .L1: ; append to right mov rax,[rsp+8] mov rdx,[rbx+n.next] mov [rbx+n.next],rax mov [rsp+8],rbx mov rbx,[rbx+n.next] mov rbx,rdx test rbx,rbx jnz .L1 sub rsp,8 ; result mov rbx,[rsp+8] mov [rsp],rbx call sort mov rbx,[rsp] mov [rsp+8],rbx mov rbx,[rsp+16] mov [rsp],rbx call sort mov rbx,[rsp] mov [rsp+16],rbx call merge mov rbx,[rsp] add rsp,24 mov [rsp+8],rbx .exit: ret ; [rsp+8] output , [rsp+16] left, [rsp+24] right merge: sub rsp,8 ; append position mov qword[rsp+16],0 mov qword[rsp],0 .L0: cmp qword[rsp+24],0 jz .right cmp qword[rsp+32],0 jz .left mov rax,[rsp+24] mov ebx,[rax+n.data] mov rcx,[rsp+32] cmp ebx,[rcx+n.data] jl .add_left .add_right: cmp qword[rsp],0 je .just_set_right mov rdx,[rsp] mov [rdx+n.next],rcx mov rdx,[rcx+n.next] mov [rsp+32],rdx mov qword[rcx+n.next],0 mov [rsp],rcx jmp .L0 .add_left: cmp qword[rsp],0 je .just_set_left mov rdx,[rsp] mov 
[rdx+n.next],rax mov rdx,[rax+n.next] mov [rsp+24],rdx mov qword[rax+n.next],0 mov [rsp],rax jmp .L0 .just_set_left: mov rdx,[rax+n.next] mov qword[rax+n.next],0 mov [rsp],rax mov [rsp+16],rax mov [rsp+24],rdx jmp .L0 .just_set_right: mov rdx,[rcx+n.next] mov qword[rcx+n.next],0 mov [rsp],rcx mov [rsp+16],rcx mov [rsp+32],rdx jmp .L0 .right: cmp qword[rsp+32],0 jz .exit mov rcx,[rsp+32] cmp qword[rsp],0 je .just_set_right_only mov rdx,[rsp] mov [rdx+n.next],rcx mov [rsp],rcx mov rdx,[rcx+n.next] mov qword[rcx+n.next],0 mov [rsp+32],rdx jmp .right .just_set_right_only: mov rdx,[rcx+n.next] mov qword[rcx+n.next],0 mov [rsp],rcx mov [rsp+16],rcx mov [rsp+32],rdx jmp .right .left: cmp qword[rsp+24],0 jz .exit mov rax,[rsp+24] cmp qword[rsp],0 je .just_set_left_only mov rdx,[rsp] mov [rdx+n.next],rax mov [rsp],rax mov rdx,[rax+n.next] mov qword[rax+n.next],0 mov [rsp+24],rdx jmp .left .just_set_left_only: mov rdx,[rax+n.next] mov qword[rax+n.next],0 mov [rsp],rax mov [rsp+24],rax mov [rsp+32],rdx jmp .left .exit: add rsp,8 ret ; rdi input list, rcx count length: mov rcx,0 .L0: test rdi,rdi jz .exit mov rdi,[rdi+n.next] inc rcx jmp .L0 .exit: ret ; [rsp+8] list to sort radix_sort: sub rsp,32*8 xor ebx,ebx .L0: mov rdi,rsp mov rcx,32 xor eax,eax rep stosq mov rdi,[rsp +32*8+8] .L1: test rdi,rdi jz .next mov ecx,ebx shl ecx,2 mov esi,[rdi+n.data] shr esi,cl and esi,0fh cmp qword[rsp+rsi*8],0 je .just_set mov rdx,[rdi+n.next] mov rax,[rsp+rsi*8+16*8] mov [rax+n.next],rdi mov [rsp+rsi*8+16*8],rdi mov qword[rdi+n.next],0 mov rdi,rdx jmp .L1 .just_set: mov rdx,[rdi+n.next] mov [rsp+rsi*8],rdi mov [rsp+rsi*8+16*8],rdi mov qword[rdi+n.next],0 mov rdi,rdx jmp .L1 .next: mov qword[rsp+32*8+8],0 xor edi,edi xor ecx,ecx .L2: mov rsi,[rsp+rcx*8] .L3: test rsi,rsi jz .next1 test rdi,rdi jz .just_set1 mov [rdi+n.next],rsi mov rdi,rsi mov rdx,[rsi+n.next] mov qword[rsi+n.next],0 mov rsi,rdx jmp .L3 .just_set1: mov rdi,rsi mov [rsp+32*8+8],rsi mov rdx,[rsi+n.next] mov qword[rsi+n.next],0 
mov rsi,rdx jmp .L3 .next1: inc ecx cmp ecx,16 jl .L2 inc rbx cmp rbx,8 jl .L0 .exit: add rsp,32*8 ret init_time: rdtscp shl rdx,32 or rax,rdx mov [elapsed],rax ret time_me: rdtscp shl rdx,32 or rax,rdx sub rax,[elapsed] cvtsi2sd xmm0,rax divsd xmm0,[clock] mov rax,1 jmp [printf] segment readable clock dq 3.8e9 fmtu db 'unsorted',0ah,0 fmts db 'sorted' ,0ah,0 fmte db 'equal',0ah,0 fmtne db 'not equal',0ah,0 fmtm db 'list merge elapsed %f seconds',0ah,0 fmtr db 'list radix elapsed %f seconds',0ah,0 fmtar db 'array radix elapsed %f seconds',0ah,0 fmt1 db 'seed: %d',0ah,0 fmt2 db '%16p %d',0ah,0 fmt3 db 'size of node %d, length %d',0ah,0 fmt4 db "N: %d",0xa,0 segment writeable list1 dq 0 list2 dq 0 array rq 1 elapsed rq 1 N rd 1; number of nodes seed rd 1 ;seed rd 1 i radix asm sort: Code: align 16 ; rdi array,rsi size radix_sort_avx2: push rbx push rbp mov rbp,rsp sub rsp,16+16*8+16*8 mov [rbp-8],rdi mov [rbp-16],rsi xor rbx,rbx vpxor ymm0,ymm0,ymm0 vmovdqu [rbp-(16+16*8)],ymm0 vmovdqu [rbp-(16+12*8)],ymm0 vmovdqu [rbp-(16+8*8)],ymm0 vmovdqu [rbp-(16+4*8)],ymm0 .L00: ; allocate 16 buckets lea rdi,[rbp-(16+16*8)+rbx*8] ; destination mov rdx,[rbp-16] mov rsi,16 mov rax,16*16384 cmp rdx,rax ; if array is bigger then L2 cache cmovg rsi,rax ; alignment, L2 cache size lea rdx,[rdx*4] ; size call [_posix_memalign] ; huge malloc is actually ; never allocated ; untill page is touched test rax,rax jnz .fexit inc rbx cmp rbx,16 jnz .L00 vpxor ymm0,ymm0,ymm0 vmovdqu [rbp-(16+16*8+16*8)],ymm0 vmovdqu [rbp-(16+16*8+12*8)],ymm0 vmovdqu [rbp-(16+16*8+8*8)],ymm0 vmovdqu [rbp-(16+16*8+4*8)],ymm0 xor rcx,rcx .L0: xor rdx,rdx vmovd xmm2,ecx ; vpbroadcastd ymm2,xmm1 mov rdi,[rbp-8] vpslld xmm2,xmm2,2 .L1:; calculate index with SIMD, based on [(num[rdx]>> (ecx << 2))&0xf] vmovdqu ymm1,[rdi+rdx*4] ;vpsrlvd ymm3,ymm1,ymm2 ; heh, finally got why vector shift ; is usefull vpsrld ymm3,ymm1,xmm2 vpand ymm3,ymm3,yword[mask] mov r10,8 vmovdqa ymm15,ymm3 vmovdqa ymm14,ymm1 .L22: ; this is scatter 
to buckets, according to dword index ; this can be unrolled, but ; I don;t know by how much performance would be increased; vmovd esi, xmm3 vmovd r11d,xmm1 mov r8,[rbp-(16+16*8)+rsi*8] ; get bucket vpsrldq xmm3,xmm3,4 mov r9,[rbp-(16+16*8+16*8)+rsi*8] ; get index into backet inc qword[rbp-(16+16*8+16*8)+rsi*8] ; update index vpsrldq xmm1,xmm1,4 mov [r8+r9*4],r11d dec r10 cmp r10,4 ; unfortunatelly vpsrldq will not shift ; upper 128 bits into lower je .adjust .L44: inc rdx cmp rdx,[rbp-16] jge .L33 test r10,r10 jnz .L22 jmp .L1 .adjust: vextracti128 xmm3,ymm15,1 vextracti128 xmm1,ymm14,1 jmp .L44 .L33: ; gather buckets (one after another) into input array xor rax,rax lea r8,[rbp-(16+16*8+16*8)] lea r9,[rbp-(16+16*8)] .L2: mov rsi,[r9] mov r10,[r8] test r10,r10 jz .L5 .L3: .L4: cmp r10,4 jl .one vmovdqa xmm15,[rsi] vmovdqu [rdi],xmm15 add rsi,16 add rdi,16 sub r10,4 jnz .L4 jmp .L5 .one: mov r11d,[rsi] mov [rdi],r11d add rsi,4 add rdi,4 dec r10 jnz .one .L5: add r8,8 add r9,8 inc rax cmp rax,16 jnz .L2 vmovdqu [rbp-(16+16*8+16*8)],ymm0 vmovdqu [rbp-(16+16*8+12*8)],ymm0 vmovdqu [rbp-(16+16*8+8*8)],ymm0 vmovdqu [rbp-(16+16*8+4*8)],ymm0 inc rcx cmp rcx,8 ; 2*(sizeof(int)) jl .L0 .exit: xor rbx,rbx .L11: mov rdi,[rbp-(16+16*8)+rbx*8] call [_free] .next: inc rbx cmp rbx,16 jnz .L11 .fexit: mov rsp,rbp pop rbp pop rbx ret rezultat: Code: ~/.../examples/sort >>> ./list_sort 1000000 seed: 1538104832 N: 1000000 unsorted 0x103ae5e30 919579 0x103ae5e10 276314 0x103ae5df0 297690 0x103ae5dd0 126247 0x103ae5db0 560392 0x103ae5d90 278668 0x103ae5d70 893887 0x103ae5d50 726189 0x103ae5d30 563301 0x103ae5d10 847808 0x103ae5cf0 32017 0x103ae5cd0 497905 0x103ae5cb0 967740 0x103ae5c90 123023 0x103ae5c70 833748 0x103ae5c50 291432 array radix elapsed 0.027579 seconds list radix elapsed 0.967346 seconds list merge elapsed 0.377042 seconds equal equal sorted 0x10392a0d0 0 0x10235cb30 1 0x102d04750 1 0x1027855d0 2 0x1033f1d30 2 0x101ca6030 6 0x101cd7c70 6 0x1030edf10 7 0x1025cf310 9 
0x10203e050 10 0x1036bfa50 10 0x1032abf30 11 0x1020d5a30 12 0x1035ae550 12 0x1030fb490 13 0x102ad9490 14 size of node 12, length 1000000 sic! merge sort liste je 2 puta brzi od radix sorta liste, zahvaljujuci boljem cache lokalitetu. E sad radix sort niza je ubedljivi pobednik. Dakle rad sa listama je drasticno sporiji od nizova bez obzira sto algoritmi imaju slicnu kompleksnost, zahvaljujuci tome sto je ram neverovatno spor i cache je sve i svja.