How many 64-bit registers can I use inside an Intel i7 CPU for storage purposes, so that I can feed them into XMM registers later? I currently use only XMM0-15, MM0-7 and R8-15. I know I can use RAX, RBX, RCX, RDX and the eight registers inside the FPU (ST0-ST7), but what others can I use? Can I use the stack registers? Thanks in advance.

I attach my application code if needed.

#include <stdio.h>
#include <stdlib.h>
#include <time.h>

/*
 * pipe_mult_ushort - multiply packed 16-bit words of `data` by the matching
 * words of `rands`, writing the products back over `data`.
 *
 * The instructions visible here cover byte offsets 0x00-0xBF of each buffer
 * (192 bytes; 96 elements, assuming ushort is a 16-bit unsigned short):
 *   0x00-0x7F  loaded into xmm0-xmm15 and multiplied with pmullw
 *   0x80-0x9F  loaded into mm0-mm7 and multiplied with pmullw
 *   0xA0-0xBF  parked in r8-r15, later copied into mm0-mm7 and multiplied
 * pmullw keeps the low 16 bits of each 16x16-bit product.
 *
 * data  - bound to the DI register ("D" constraint); read, then overwritten.
 *         movdqa requires the address to be 16-byte aligned.
 * rands - bound to the SI register ("S" constraint); only read here.
 *
 * NOTE(review): this excerpt is incomplete - the opening brace of the
 * function body, the rest of the clobber list and the closing of the asm
 * statement are not visible.
 * NOTE(review): r8-r15 exist only in 64-bit mode, yet every address is formed
 * from the 32-bit edi/esi, which would drop the upper half of a 64-bit
 * pointer - confirm whether rdi/rsi were intended.
 * NOTE(review): no emms is visible after the MMX instructions; confirm that
 * no x87 floating-point code runs afterwards without one.
 */
void pipe_mult_ushort(ushort *data,ushort *rands)
__asm__ __volatile__(".intel_syntax noprefix\n\t"
		// Phase 1: pull as much of both buffers as possible into registers
		// up front, before any arithmetic, to minimize the DRAM delay.

                "movdqa xmm0,[edi]\n\t"         // xmm0 <- data[0x00..0x0F]
                "movdqa xmm1,[esi]\n\t"         // xmm1 <- rands[0x00..0x0F]

                "movdqa xmm2,0x10[edi]\n\t"     // load xmm2 (data) & xmm3 (rands)
                "movdqa xmm3,0x10[esi]\n\t"
                "movdqa xmm4,0x20[edi]\n\t"     // load xmm4 (data) & xmm5 (rands)
                "movdqa xmm5,0x20[esi]\n\t"
                "movdqa xmm6,0x30[edi]\n\t"     // load xmm6 (data) & xmm7 (rands)
                "movdqa xmm7,0x30[esi]\n\t"
                "movdqa xmm8,0x40[edi]\n\t"     // load xmm8 (data) & xmm9 (rands)
                "movdqa xmm9,0x40[esi]\n\t"

                "movdqa xmm10,0x50[edi]\n\t"    // load xmm10 (data) & xmm11 (rands)
                "movdqa xmm11,0x50[esi]\n\t"

                "movdqa xmm12,0x60[edi]\n\t"    // load xmm12 (data) & xmm13 (rands)
                "movdqa xmm13,0x60[esi]\n\t"

                "movdqa xmm14,0x70[edi]\n\t"    // load xmm14 (data) & xmm15 (rands)
                "movdqa xmm15,0x70[esi]\n\t"

		"movq mm0,0x80[edi]\n\t"	// load mm0-mm7: even = data, odd = rands
		"movq mm1,0x80[esi]\n\t"
		"movq mm2,0x88[edi]\n\t"
		"movq mm3,0x88[esi]\n\t"
		"movq mm4,0x90[edi]\n\t"
		"movq mm5,0x90[esi]\n\t"
		"movq mm6,0x98[edi]\n\t"
		"movq mm7,0x98[esi]\n\t"

		"movq  r8,0xA0[edi]\n\t"	// park 8 more quadwords in r8-r15: even = data, odd = rands
		"movq  r9,0xA0[esi]\n\t"
		"movq r10,0xA8[edi]\n\t"
		"movq r11,0xA8[esi]\n\t"
		"movq r12,0xB0[edi]\n\t"
		"movq r13,0xB0[esi]\n\t"
		"movq r14,0xB8[edi]\n\t"
		"movq r15,0xB8[esi]\n\t"

		// Phase 2: every register used for staging is now filled; do the multiplies.
		// XMM pairs first: each even register receives the product.
                "pmullw xmm0,xmm1\n\t"          // xmm0 *= xmm1
                "pmullw xmm2,xmm3\n\t"          // xmm2 *= xmm3
                "pmullw xmm4,xmm5\n\t"          // xmm4 *= xmm5
                "pmullw xmm6,xmm7\n\t"          // xmm6 *= xmm7
                "pmullw xmm8,xmm9\n\t"          // xmm8 *= xmm9
                "pmullw xmm10,xmm11\n\t"        // xmm10 *= xmm11
                "pmullw xmm12,xmm13\n\t"        // xmm12 *= xmm13
                "pmullw xmm14,xmm15\n\t"        // xmm14 *= xmm15

		// MMX pairs second
                "pmullw mm0,mm1\n\t"        // mm0 *= mm1
                "pmullw mm2,mm3\n\t"        // mm2 *= mm3
                "pmullw mm4,mm5\n\t"        // mm4 *= mm5
                "pmullw mm6,mm7\n\t"        // mm6 *= mm7

		// Phase 3: write the XMM products back over data
                "movdqa [edi],xmm0\n\t"         // xmm0 -> data + 0x00
                "movdqa 0x10[edi],xmm2\n\t"     // xmm2 -> data + 0x10
                "movdqa 0x20[edi],xmm4\n\t"     // xmm4 -> data + 0x20
                "movdqa 0x30[edi],xmm6\n\t"     // xmm6 -> data + 0x30
                "movdqa 0x40[edi],xmm8\n\t"     // xmm8 -> data + 0x40
                "movdqa 0x50[edi],xmm10\n\t"    // xmm10 -> data + 0x50
                "movdqa 0x60[edi],xmm12\n\t"    // xmm12 -> data + 0x60
                "movdqa 0x70[edi],xmm14\n\t"    // xmm14 -> data + 0x70
		// write the MMX products back over data
                "movq 0x80[edi],mm0\n\t"    	// mm0 -> data + 0x80
                "movq 0x88[edi],mm2\n\t"    	// mm2 -> data + 0x88
                "movq 0x90[edi],mm4\n\t"    	// mm4 -> data + 0x90
                "movq 0x98[edi],mm6\n\t"    	// mm6 -> data + 0x98

		// Phase 4: the MMX registers are free again; refill them from the
		// quadwords parked in r8-r15 and run the same multiply/store sequence.
		"movq mm0,r8\n\t"		// data quadword  (offset 0xA0)
		"movq mm1,r9\n\t"		// rands quadword (offset 0xA0)
		"movq mm2,r10\n\t"		// data quadword  (offset 0xA8)
		"movq mm3,r11\n\t"		// rands quadword (offset 0xA8)
		"movq mm4,r12\n\t"		// data quadword  (offset 0xB0)
		"movq mm5,r13\n\t"		// rands quadword (offset 0xB0)
		"movq mm6,r14\n\t"		// data quadword  (offset 0xB8)
		"movq mm7,r15\n\t"		// rands quadword (offset 0xB8)
		// multiply the refilled MMX pairs
                "pmullw mm0,mm1\n\t"          // mm0 *= mm1
                "pmullw mm2,mm3\n\t"          // mm2 *= mm3
                "pmullw mm4,mm5\n\t"          // mm4 *= mm5
                "pmullw mm6,mm7\n\t"          // mm6 *= mm7
		// write these products back over data
                "movq 0xA0[edi],mm0\n\t"    	// mm0 -> data + 0xA0
                "movq 0xA8[edi],mm2\n\t"    	// mm2 -> data + 0xA8
                "movq 0xB0[edi],mm4\n\t"    	// mm4 -> data + 0xB0
                "movq 0xB8[edi],mm6\n\t"    	// mm6 -> data + 0xB8

                     // NOTE(review): only one ':' precedes these operands in the
                     // visible text, which would put them in the output-operand
                     // position - confirm against the complete source.
                     :  "D" (data) ,"S" (rands)
                     // NOTE(review): the clobber list is cut off after xmm7; the
                     // asm above also writes xmm8-xmm14 (even), mm0-mm7, r8-r15
                     // and memory - confirm they are declared in the full source.
                     :  "xmm0","xmm1","xmm2","xmm3","xmm4","xmm5","xmm6","xmm7",


#include <stdio.h>
#include <stdlib.h>
#include <time.h>

#include "pipe_line_math.h"

/* Number of elements in each of the `rands` and `data` arrays (402653184).
 * The product is parenthesized so the macro expands as a single value inside
 * larger expressions such as `x % _ARRAY_SIZE_` or `x / _ARRAY_SIZE_`. */
#define _ARRAY_SIZE_ (256*256*256*24)
/* Index stride, in elements, used by main between successive
 * pipe_mult_ushort calls. */
#define _ELTS_PER_PIPE_ 112
/* Operand arrays handed to pipe_mult_ushort by main, _ARRAY_SIZE_ elements
 * each. Both are declared 16-byte aligned; the movdqa loads and stores in
 * pipe_mult_ushort require 16-byte aligned addresses. */
ushort __attribute__ ((aligned (16))) rands[_ARRAY_SIZE_];
ushort __attribute__ ((aligned (16))) data[_ARRAY_SIZE_];

/* Timestamps that main prints as the "start" (tspec1) and "end" (tspec2)
 * of the timed run. */
struct timespec tspec1;
struct timespec tspec2;

/*
 * Benchmark driver: walks data/rands in strides of _ELTS_PER_PIPE_ elements,
 * calling pipe_mult_ushort on each position, makes a second identical pass,
 * then prints the recorded timestamps and a "sample data" section.
 *
 * NOTE(review): this excerpt is incomplete - the body of the fill loop, the
 * assignments to max, diff, tspec1 and tspec2, the sample-print statement and
 * the closing braces are not visible.
 * NOTE(review): main is declared without a return type (implicit int).
 */
main() {
ulong i,max;    /* NOTE(review): no assignment to max is visible before the loops below read it */
double diff;    /* NOTE(review): no assignment to diff is visible before it is printed */

    for (i=0;i<_ARRAY_SIZE_;i++) { /// fill with any data (loop body not visible in this excerpt)

    /* NOTE(review): the stride is _ELTS_PER_PIPE_ (112) elements, but the asm
     * visible in pipe_mult_ushort only touches byte offsets 0x00-0xBF
     * (96 16-bit elements) per call - confirm the stride is intended. */
    for (i=0;i<max;i=i+_ELTS_PER_PIPE_) pipe_mult_ushort(&data[i],&rands[i]);   
    for (i=0;i<max;i=i+_ELTS_PER_PIPE_) pipe_mult_ushort(&data[i],&rands[i]);  // second pass over the same range
    /* NOTE(review): tv_sec and tv_nsec are printed with %d; tv_nsec is a long
     * and tv_sec a time_t, so confirm the specifiers match on the target. */
    printf("time pipeline multiply:\nstart: %d:%d\n  end: %d:%d ; total diff: %f\n",tspec1.tv_sec,tspec1.tv_nsec,tspec2.tv_sec,tspec2.tv_nsec,diff);
    printf("sample data:\n");
    for (i=0;i<64;i++) {
        if (!((i+1)%16))    printf("\n");   /* line break after every 16th index */


This article has been dead for over six months. Start a new discussion instead.