/*
 * Micro-benchmark: prints the number of time-stamp-counter ticks taken by
 * one call of a 16-limb 64-bit add-with-carry kernel (adc64) and by one call
 * of a 32-bit load / adc / store sequence (adc32).
 *
 * x86-64 only; uses GCC-style extended inline assembly.
 *
 * Compile with "gcc -O2 -fno-inline-functions -o add add.c".
 * Add -DEINS, -DZWEI, -DDREI or -DVIER to select one of the adc64 variants
 * (see the #if / #elif chain inside adc64). With none of them defined a
 * simple pointer-stepping loop is used.
 */
#include <stdio.h>

#if !defined(__x86_64__)
#error "this benchmark uses x86-64 inline assembly"
#endif

/*
 * Execute rdtsc and store the contents of the a register in `low`.
 * edx is declared clobbered and its contents are discarded.
 */
#define rdtscl(low) \
    __asm__ __volatile__ ("rdtsc" : "=a" (low) : : "edx")

/*
 * Issue sixteen 8-byte loads from a[0..15] into rax. Each load overwrites
 * the previous one; the function exists for its memory traffic only.
 * Only referenced from the disabled (#if 0) branch of adc64.
 */
static __attribute__((unused)) void load64(unsigned long *a)
{
    __asm__ __volatile__(
        "mov (%%rsi),%%rax\n\t"
        "mov 8(%%rsi),%%rax\n\t"
        "mov 16(%%rsi),%%rax\n\t"
        "mov 24(%%rsi),%%rax\n\t"
        "mov 32(%%rsi),%%rax\n\t"
        "mov 40(%%rsi),%%rax\n\t"
        "mov 48(%%rsi),%%rax\n\t"
        "mov 56(%%rsi),%%rax\n\t"
        "mov 64(%%rsi),%%rax\n\t"
        "mov 72(%%rsi),%%rax\n\t"
        "mov 80(%%rsi),%%rax\n\t"
        "mov 88(%%rsi),%%rax\n\t"
        "mov 96(%%rsi),%%rax\n\t"
        "mov 104(%%rsi),%%rax\n\t"
        "mov 112(%%rsi),%%rax\n\t"
        "mov 120(%%rsi),%%rax\n\t"
        :
        : "S"(a)
        : "rax", "memory");     /* rax is overwritten; memory is read */
}

/*
 * Issue sixteen 8-byte stores of rax to a[0..15]. rax is not an operand, so
 * the value written is whatever the register happens to hold.
 * Only referenced from the disabled (#if 0) branch of adc64.
 */
static __attribute__((unused)) void store64(unsigned long *a)
{
    __asm__ __volatile__(
        "mov %%rax,(%%rsi)\n\t"
        "mov %%rax,8(%%rsi)\n\t"
        "mov %%rax,16(%%rsi)\n\t"
        "mov %%rax,24(%%rsi)\n\t"
        "mov %%rax,32(%%rsi)\n\t"
        "mov %%rax,40(%%rsi)\n\t"
        "mov %%rax,48(%%rsi)\n\t"
        "mov %%rax,56(%%rsi)\n\t"
        "mov %%rax,64(%%rsi)\n\t"
        "mov %%rax,72(%%rsi)\n\t"
        "mov %%rax,80(%%rsi)\n\t"
        "mov %%rax,88(%%rsi)\n\t"
        "mov %%rax,96(%%rsi)\n\t"
        "mov %%rax,104(%%rsi)\n\t"
        "mov %%rax,112(%%rsi)\n\t"
        "mov %%rax,120(%%rsi)\n\t"
        :
        : "S"(a)
        : "memory");            /* sixteen limbs of *a are written */
}

/* One limb of c = a + b: load b[off/8] into reg and add a[off/8] with carry. */
#define ADC64_LOAD_ADD(off, reg) \
    "mov " #off "(%%rsi),%%" #reg "\n\t" \
    "adc " #off "(%%rdi),%%" #reg "\n\t"

/* Store reg to c[off/8]. */
#define ADC64_STORE(off, reg) \
    "mov %%" #reg ", " #off "(%%rbx)\n\t"

/* Load, add with carry and store one limb through a single register. */
#define ADC64_LIMB(off, reg) \
    ADC64_LOAD_ADD(off, reg) \
    ADC64_STORE(off, reg)

/*
 * c[0..15] = a[0..15] + b[0..15], propagating the carry from limb to limb.
 *
 * The instruction schedule is selected at compile time:
 *   EINS     four limbs loaded and added into rax/r8/r9/r10, then four stores
 *   ZWEI     load / adc / store per limb, rotating through rax/r8/r9/r10
 *   DREI     load / adc / store per limb, always through rax
 *   VIER     loop using a negative index that counts up to zero
 *   default  loop that steps the three pointers and counts rcx down
 *
 * Every variant starts with the carry flag cleared. The loops use inc / dec
 * as counters because those instructions leave the carry flag untouched.
 *
 * noinline keeps the timed region a real call regardless of the inlining
 * heuristics of the compiler in use.
 */
static __attribute__((noinline)) void
adc64(unsigned long *a, unsigned long *b, unsigned long *c)
{
#if 0
    /* Disabled alternative: register-to-register adc chain between a block
     * of loads and a block of stores. */
    load64(a);
    load64(b);
    __asm__ __volatile__(
        "adc %%rdx,%%rax\n\t"
        "adc %%rdx,%%rcx\n\t"
        "adc %%rdx,%%rax\n\t"
        "adc %%rdx,%%rcx\n\t"
        "adc %%rdx,%%rax\n\t"
        "adc %%rdx,%%rcx\n\t"
        "adc %%rdx,%%rax\n\t"
        "adc %%rdx,%%rcx\n\t"
        "adc %%rdx,%%rax\n\t"
        "adc %%rdx,%%rcx\n\t"
        "adc %%rdx,%%rax\n\t"
        "adc %%rdx,%%rcx\n\t"
        "adc %%rdx,%%rax\n\t"
        "adc %%rdx,%%rcx\n\t"
        "adc %%rdx,%%rax\n\t"
        "adc %%rdx,%%rcx\n\t"
        :
        :
        : "rax", "rcx", "cc");
    store64(c);
#else
    /*
     * The loop variants advance rsi / rdi / rbx, so the pointers are passed
     * as read-write operands on local copies rather than as inputs.
     */
    unsigned long *src = b;     /* rsi */
    unsigned long *addend = a;  /* rdi */
    unsigned long *dst = c;     /* rbx */

    __asm__ __volatile__(
#if defined(EINS)
        "clc\n\t"
        ADC64_LOAD_ADD(0, rax)
        ADC64_LOAD_ADD(8, r8)
        ADC64_LOAD_ADD(16, r9)
        ADC64_LOAD_ADD(24, r10)
        ADC64_STORE(0, rax)
        ADC64_STORE(8, r8)
        ADC64_STORE(16, r9)
        ADC64_STORE(24, r10)
        ADC64_LOAD_ADD(32, rax)
        ADC64_LOAD_ADD(40, r8)
        ADC64_LOAD_ADD(48, r9)
        ADC64_LOAD_ADD(56, r10)
        ADC64_STORE(32, rax)
        ADC64_STORE(40, r8)
        ADC64_STORE(48, r9)
        ADC64_STORE(56, r10)
        ADC64_LOAD_ADD(64, rax)
        ADC64_LOAD_ADD(72, r8)
        ADC64_LOAD_ADD(80, r9)
        ADC64_LOAD_ADD(88, r10)
        ADC64_STORE(64, rax)
        ADC64_STORE(72, r8)
        ADC64_STORE(80, r9)
        ADC64_STORE(88, r10)
        ADC64_LOAD_ADD(96, rax)
        ADC64_LOAD_ADD(104, r8)
        ADC64_LOAD_ADD(112, r9)
        ADC64_LOAD_ADD(120, r10)
        ADC64_STORE(96, rax)
        ADC64_STORE(104, r8)
        ADC64_STORE(112, r9)
        ADC64_STORE(120, r10)
#elif defined(ZWEI)
        "clc\n\t"
        ADC64_LIMB(0, rax)
        ADC64_LIMB(8, r8)
        ADC64_LIMB(16, r9)
        ADC64_LIMB(24, r10)
        ADC64_LIMB(32, rax)
        ADC64_LIMB(40, r8)
        ADC64_LIMB(48, r9)
        ADC64_LIMB(56, r10)
        ADC64_LIMB(64, rax)
        ADC64_LIMB(72, r8)
        ADC64_LIMB(80, r9)
        ADC64_LIMB(88, r10)
        ADC64_LIMB(96, rax)
        ADC64_LIMB(104, r8)
        ADC64_LIMB(112, r9)
        ADC64_LIMB(120, r10)
#elif defined(DREI)
        "clc\n\t"
        ADC64_LIMB(0, rax)
        ADC64_LIMB(8, rax)
        ADC64_LIMB(16, rax)
        ADC64_LIMB(24, rax)
        ADC64_LIMB(32, rax)
        ADC64_LIMB(40, rax)
        ADC64_LIMB(48, rax)
        ADC64_LIMB(56, rax)
        ADC64_LIMB(64, rax)
        ADC64_LIMB(72, rax)
        ADC64_LIMB(80, rax)
        ADC64_LIMB(88, rax)
        ADC64_LIMB(96, rax)
        ADC64_LIMB(104, rax)
        ADC64_LIMB(112, rax)
        ADC64_LIMB(120, rax)
#elif defined(VIER)
        /* mov and lea do not touch the flags, so CF stays clear until the
         * first adc. */
        "clc\n\t"
        "mov $-16,%%rcx\n\t"
        "lea 128(%%rsi),%%rsi\n\t"
        "lea 128(%%rdi),%%rdi\n\t"
        "lea 128(%%rbx),%%rbx\n\t"
        "1:\n\t"
        "mov (%%rsi,%%rcx,8),%%rax\n\t"
        "adc (%%rdi,%%rcx,8),%%rax\n\t"
        "mov %%rax, (%%rbx,%%rcx,8)\n\t"
        "inc %%rcx\n\t"
        "jnz 1b\n\t"
#else
        "mov $16,%%rcx\n\t"
        "xor %%rdx,%%rdx\n\t"   /* xor also clears the carry flag */
        "1:\n\t"
        "mov (%%rsi),%%rax\n\t"
        "adc (%%rdi),%%rax\n\t"
        "mov %%rax, (%%rbx)\n\t"
        "lea 8(%%rsi),%%rsi\n\t"
        "lea 8(%%rdi),%%rdi\n\t"
        "lea 8(%%rbx),%%rbx\n\t"
        "dec %%rcx\n\t"
        "jnz 1b\n\t"
#endif
        : "+S"(src), "+D"(addend), "+b"(dst)
        :
        /* Union of everything any variant touches. */
        : "rax", "rcx", "rdx", "r8", "r9", "r10", "cc", "memory");
#endif
}

#undef ADC64_LIMB
#undef ADC64_STORE
#undef ADC64_LOAD_ADD

/*
 * Issue sixteen 4-byte loads from a[0..15] into eax. Each load overwrites
 * the previous one; the function exists for its memory traffic only.
 */
static void load32(unsigned int *a)
{
    __asm__ __volatile__(
        "mov (%%rsi),%%eax\n\t"
        "mov 4(%%rsi),%%eax\n\t"
        "mov 8(%%rsi),%%eax\n\t"
        "mov 12(%%rsi),%%eax\n\t"
        "mov 16(%%rsi),%%eax\n\t"
        "mov 20(%%rsi),%%eax\n\t"
        "mov 24(%%rsi),%%eax\n\t"
        "mov 28(%%rsi),%%eax\n\t"
        "mov 32(%%rsi),%%eax\n\t"
        "mov 36(%%rsi),%%eax\n\t"
        "mov 40(%%rsi),%%eax\n\t"
        "mov 44(%%rsi),%%eax\n\t"
        "mov 48(%%rsi),%%eax\n\t"
        "mov 52(%%rsi),%%eax\n\t"
        "mov 56(%%rsi),%%eax\n\t"
        "mov 60(%%rsi),%%eax\n\t"
        :
        : "S"(a)
        : "rax", "memory");     /* eax is overwritten; memory is read */
}

/*
 * Issue sixteen 4-byte stores of eax to a[0..15]. eax is not an operand, so
 * the value written is whatever the register happens to hold.
 */
static void store32(unsigned int *a)
{
    __asm__ __volatile__(
        "mov %%eax,(%%rsi)\n\t"
        "mov %%eax,4(%%rsi)\n\t"
        "mov %%eax,8(%%rsi)\n\t"
        "mov %%eax,12(%%rsi)\n\t"
        "mov %%eax,16(%%rsi)\n\t"
        "mov %%eax,20(%%rsi)\n\t"
        "mov %%eax,24(%%rsi)\n\t"
        "mov %%eax,28(%%rsi)\n\t"
        "mov %%eax,32(%%rsi)\n\t"
        "mov %%eax,36(%%rsi)\n\t"
        "mov %%eax,40(%%rsi)\n\t"
        "mov %%eax,44(%%rsi)\n\t"
        "mov %%eax,48(%%rsi)\n\t"
        "mov %%eax,52(%%rsi)\n\t"
        "mov %%eax,56(%%rsi)\n\t"
        "mov %%eax,60(%%rsi)\n\t"
        :
        : "S"(a)
        : "memory");            /* sixteen dwords of *a are written */
}

/*
 * 32-bit timing sequence: sixteen loads from a, sixteen loads from b,
 * sixteen register-to-register adc instructions alternating between eax and
 * ecx, then sixteen stores to c.
 *
 * Unlike adc64's active branch this does not add the arrays element by
 * element: the adc chain works on whatever eax / ecx / edx contain, so the
 * data stored to c is not a sum of a and b.
 *
 * noinline keeps the timed region a real call regardless of the inlining
 * heuristics of the compiler in use.
 */
static __attribute__((noinline)) void
adc32(unsigned int *a, unsigned int *b, unsigned int *c)
{
    load32(a);
    load32(b);
    __asm__ __volatile__(
        "adc %%edx,%%eax\n\t"
        "adc %%edx,%%ecx\n\t"
        "adc %%edx,%%eax\n\t"
        "adc %%edx,%%ecx\n\t"
        "adc %%edx,%%eax\n\t"
        "adc %%edx,%%ecx\n\t"
        "adc %%edx,%%eax\n\t"
        "adc %%edx,%%ecx\n\t"
        "adc %%edx,%%eax\n\t"
        "adc %%edx,%%ecx\n\t"
        "adc %%edx,%%eax\n\t"
        "adc %%edx,%%ecx\n\t"
        "adc %%edx,%%eax\n\t"
        "adc %%edx,%%ecx\n\t"
        "adc %%edx,%%eax\n\t"
        "adc %%edx,%%ecx\n\t"
        :
        :
        : "rax", "rcx", "cc");
    store32(c);
}

/*
 * Time one call of adc64 and one call of adc32 and print the tick counts.
 * Each kernel is called once untimed first, then once between two rdtscl
 * reads; the cost of two back-to-back rdtscl reads is subtracted.
 */
int main(void)
{
    /* Zero-initialised so the kernels never read indeterminate memory. */
    unsigned long a64[32] = {0};
    unsigned long b64[32] = {0};
    unsigned long c64[32] = {0};
    unsigned int a32[32] = {0};
    unsigned int b32[32] = {0};
    unsigned int c32[32] = {0};
    unsigned long a, b, d;      /* unsigned long matches the %lu conversions */

    adc64(a64, b64, c64);       /* untimed first call */

    /* Measurement overhead: two rdtscl reads with nothing in between. */
    rdtscl(a);
    rdtscl(b);
    d = b - a;

    rdtscl(a);
    adc64(a64, b64, c64);
    rdtscl(b);
    printf("64-bit: %lu\n", b - a - d);

    adc32(a32, b32, c32);       /* untimed first call */
    rdtscl(a);
    adc32(a32, b32, c32);
    rdtscl(b);
    printf("32-bit: %lu\n", b - a - d);

    return 0;
}