Speed Results overall: 
SSE2 /fast    -> SOR Mflops:   816.85    (100 x 100)
SSE2 /precise -> SOR Mflops:   900.03    (100 x 100)
SSE /precise  -> SOR Mflops:   920.23    (100 x 100)
SSE /fast     -> SOR Mflops:   862.56    (100 x 100)
precise       -> SOR Mflops:   920.23    (100 x 100)
strict        -> SOR Mflops:   659.46    (100 x 100)
fast          -> SOR Mflops:   862.56    (100 x 100)

*********************************************
SSE2 /fast
; Compiler-generated listing excerpt (the loop label and the register setup are
; not shown): one pass of the unrolled inner loop for source line 38. Each pass
; stores four consecutive doubles at [eax-8], [eax], [eax+8] and [eax+16], then
; advances edx by 4 and eax/ecx by 32 bytes.
;   xmm4 scales each four-term neighbour sum (presumably omega_over_four).
;   xmm5 scales the old centre value and that product is SUBTRACTED, so xmm5
;        presumably holds omega - 1 rather than one_minus_omega.
;        NOTE(review): both constants are loaded outside this excerpt - confirm.
;   Each freshly computed element stays in a register and is reused as one of
;   the four terms of the next element's sum.
; 38   :                     Gi[j] = omega_over_four * (Gim1[j] + Gip1[j] + Gi[j-1] + Gi[j+1]) + one_minus_omega * Gi[j];

; -- element 1: ([esi+edx*8] + [eax-16] + [ecx-16] + [eax]) * xmm4 - [eax-8] * xmm5 -> [eax-8]
;    (old values of [eax] and [eax+8] are also preloaded into xmm2 / xmm3)
	movsd	xmm0, QWORD PTR [esi+edx*8]
	addsd	xmm0, QWORD PTR [eax-16]
	addsd	xmm0, QWORD PTR [ecx-16]
	movsd	xmm2, QWORD PTR [eax]
	movsd	xmm1, QWORD PTR [eax-8]
	movsd	xmm3, QWORD PTR [eax+8]
	mulsd	xmm1, xmm5
	addsd	xmm0, xmm2
	mulsd	xmm0, xmm4
	subsd	xmm0, xmm1
	movsd	QWORD PTR [eax-8], xmm0
; -- element 2: ([edi+ebx] + element 1 + old [eax+8] + [edi]) * xmm4 - old [eax] * xmm5 -> [eax]
;    edi is used as an offset from eax here and is then reloaded with Nm1 for
;    the loop test further down.
	add	edi, eax
	movsd	xmm1, QWORD PTR [edi+ebx]
	addsd	xmm1, xmm0
	addsd	xmm1, xmm3
	addsd	xmm1, QWORD PTR [edi]
	mov	edi, DWORD PTR _Nm1$[esp+32]
	mulsd	xmm1, xmm4
	mulsd	xmm2, xmm5
	subsd	xmm1, xmm2
	movsd	xmm2, QWORD PTR [eax+16]
	movsd	QWORD PTR [eax], xmm1
; -- element 3: ([ebx+ecx] + element 2 + [ecx] + old [eax+16]) * xmm4 - old [eax+8] * xmm5 -> [eax+8]
	movsd	xmm0, QWORD PTR [ebx+ecx]
	addsd	xmm0, xmm1
	addsd	xmm0, QWORD PTR [ecx]
	addsd	xmm0, xmm2
	mulsd	xmm0, xmm4
	mulsd	xmm3, xmm5
	subsd	xmm0, xmm3
	movsd	QWORD PTR [eax+8], xmm0
; -- element 4: ([esi+edx*8+24] + [eax+24] + [ecx+8] + element 3) * xmm4 - old [eax+16] * xmm5 -> [eax+16]
	movsd	xmm1, QWORD PTR [esi+edx*8+24]
	addsd	xmm1, QWORD PTR [eax+24]
	addsd	xmm1, QWORD PTR [ecx+8]
	addsd	xmm1, xmm0
	mulsd	xmm1, xmm4
	mulsd	xmm2, xmm5
	subsd	xmm1, xmm2
	movsd	QWORD PTR [eax+16], xmm1
; -- loop control: advance index/pointers, repeat while edx < Nm1 - 3 (signed compare)
	add	edx, 4
	add	edi, -3					; fffffffdH
	add	eax, 32					; 00000020H
	add	ecx, 32					; 00000020H
	cmp	edx, edi
	jl	$LL21@SOR_execut
; after the branch falls through, edi is reloaded from the stack temporary tv1904
	mov	edi, DWORD PTR tv1904[esp+36]
	
*********************************************	
SSE2 /precise
; Compiler-generated listing excerpt (the loop label and the register setup are
; not shown): one pass of the unrolled inner loop for source line 38. Each pass
; stores four consecutive doubles at [eax-8], [eax], [eax+8] and [eax+16], then
; advances edx by 4 and eax/ecx by 32 bytes.
;   xmm4 scales each four-term neighbour sum (presumably omega_over_four).
;   xmm5 scales the old centre value and that product is ADDED (presumably
;        one_minus_omega). NOTE(review): both constants are loaded outside this
;        excerpt - confirm.
;   Every sum is accumulated in the same pattern: the two other-row terms first,
;   then the element just computed (or [eax-16] for element 1), then the
;   not-yet-updated following element.
; 38   :                     Gi[j] = omega_over_four * (Gim1[j] + Gip1[j] + Gi[j-1] + Gi[j+1]) + one_minus_omega * Gi[j];

; -- element 1: ([esi+edx*8] + [ecx-16] + [eax-16] + [eax]) * xmm4 + [eax-8] * xmm5 -> [eax-8]
;    (old values of [eax] and [eax+8] are also preloaded into xmm2 / xmm3)
	movsd	xmm0, QWORD PTR [esi+edx*8]
	addsd	xmm0, QWORD PTR [ecx-16]
	addsd	xmm0, QWORD PTR [eax-16]
	movsd	xmm2, QWORD PTR [eax]
	movsd	xmm1, QWORD PTR [eax-8]
	movsd	xmm3, QWORD PTR [eax+8]
	mulsd	xmm1, xmm5
	addsd	xmm0, xmm2
	mulsd	xmm0, xmm4
	addsd	xmm0, xmm1
	movsd	QWORD PTR [eax-8], xmm0
; -- element 2: ([edi+ebx] + [edi] + element 1 + old [eax+8]) * xmm4 + old [eax] * xmm5 -> [eax]
;    edi is used as an offset from eax here and is then reloaded with Nm1 for
;    the loop test further down.
	add	edi, eax
	movsd	xmm1, QWORD PTR [edi+ebx]
	addsd	xmm1, QWORD PTR [edi]
	mov	edi, DWORD PTR _Nm1$[esp+32]
	addsd	xmm1, xmm0
	addsd	xmm1, xmm3
	mulsd	xmm1, xmm4
	mulsd	xmm2, xmm5
	addsd	xmm1, xmm2
	movsd	xmm2, QWORD PTR [eax+16]
	movsd	QWORD PTR [eax], xmm1
; -- element 3: ([ebx+ecx] + [ecx] + element 2 + old [eax+16]) * xmm4 + old [eax+8] * xmm5 -> [eax+8]
	movsd	xmm0, QWORD PTR [ebx+ecx]
	addsd	xmm0, QWORD PTR [ecx]
	addsd	xmm0, xmm1
	addsd	xmm0, xmm2
	mulsd	xmm0, xmm4
	mulsd	xmm3, xmm5
	addsd	xmm0, xmm3
	movsd	QWORD PTR [eax+8], xmm0
; -- element 4: ([esi+edx*8+24] + [ecx+8] + element 3 + [eax+24]) * xmm4 + old [eax+16] * xmm5 -> [eax+16]
	movsd	xmm1, QWORD PTR [esi+edx*8+24]
	addsd	xmm1, QWORD PTR [ecx+8]
	addsd	xmm1, xmm0
	addsd	xmm1, QWORD PTR [eax+24]
	mulsd	xmm1, xmm4
	mulsd	xmm2, xmm5
	addsd	xmm1, xmm2
	movsd	QWORD PTR [eax+16], xmm1
; -- loop control: advance index/pointers, repeat while edx < Nm1 - 3 (signed compare)
	add	edx, 4
	add	edi, -3					; fffffffdH
	add	eax, 32					; 00000020H
	add	ecx, 32					; 00000020H
	cmp	edx, edi
	jl	$LL21@SOR_execut
; after the branch falls through, edi is reloaded from the stack temporary tv1880
	mov	edi, DWORD PTR tv1880[esp+36]
	
*********************************************
SSE /precise 
; Compiler-generated listing excerpt (the loop label and the register setup are
; not shown): one pass of the unrolled inner loop for source line 38.
; Note that this excerpt contains only x87 instructions (fld/fadd/fmul/fst);
; no SSE instruction appears in it.
;   x87 stack: two loop-invariant multipliers are already on the stack when the
;   pass starts (loaded outside this excerpt). With a neighbour sum in ST(0),
;   ST(1) scales the sum (presumably omega_over_four) and ST(2) scales the old
;   centre value (presumably one_minus_omega); the two products are added.
;   edx/eax/ecx are advanced right after the first loads, so the later
;   displacements are relative to the already-advanced pointers; the four
;   results go to [eax-40], [eax-32], [eax-24] and [eax-16].
;   The first three results are stored with fst (no pop) so the value stays in
;   ST(0) and is folded into the next element's sum; the last one uses fstp.
; 38   :                     Gi[j] = omega_over_four * (Gim1[j] + Gip1[j] + Gi[j-1] + Gi[j+1]) + one_minus_omega * Gi[j];

; -- element 1 -> [eax-40] (pointer bumps are interleaved with the loads)
	fld	QWORD PTR [esi+edx*8]
	add	edi, eax
	fadd	QWORD PTR [ecx-16]
	add	edx, 4
	add	eax, 32					; 00000020H
	add	ecx, 32					; 00000020H
	fadd	QWORD PTR [eax-48]
	fadd	QWORD PTR [eax-32]
	fmul	ST(0), ST(1)
	fld	QWORD PTR [eax-40]
	fmul	ST(0), ST(3)
	faddp	ST(1), ST(0)
	fst	QWORD PTR [eax-40]
; -- element 2 -> [eax-32]; the loop test (edx < Nm1 - 3) is issued early here and
;    its flags are consumed by the jl at the end - only x87 instructions follow the cmp
	fld	QWORD PTR [edi+ebx]
	fadd	QWORD PTR [edi]
	mov	edi, DWORD PTR _Nm1$[esp+32]
	add	edi, -3					; fffffffdH
	cmp	edx, edi
	faddp	ST(1), ST(0)
	fadd	QWORD PTR [eax-24]
	fmul	ST(0), ST(1)
	fld	QWORD PTR [eax-32]
	fmul	ST(0), ST(3)
	faddp	ST(1), ST(0)
	fst	QWORD PTR [eax-32]
; -- element 3 -> [eax-24]; here the centre-value multiplier is pushed (fld ST(2))
;    and multiplied by the memory operand instead of the other way round
	fld	QWORD PTR [ebx+ecx-32]
	fadd	QWORD PTR [ecx-32]
	faddp	ST(1), ST(0)
	fadd	QWORD PTR [eax-16]
	fmul	ST(0), ST(1)
	fld	ST(2)
	fmul	QWORD PTR [eax-24]
	faddp	ST(1), ST(0)
	fst	QWORD PTR [eax-24]
; -- element 4 -> [eax-16]; fstp pops the result, leaving only the two multipliers
	fld	QWORD PTR [esi+edx*8-8]
	fadd	QWORD PTR [ecx-24]
	faddp	ST(1), ST(0)
	fadd	QWORD PTR [eax-8]
	fmul	ST(0), ST(1)
	fld	QWORD PTR [eax-16]
	fmul	ST(0), ST(3)
	faddp	ST(1), ST(0)
	fstp	QWORD PTR [eax-16]
; -- repeat while edx < Nm1 - 3 (signed; flags come from the cmp above)
	jl	SHORT $LN21@SOR_execut
; after the branch falls through, edi is reloaded from the stack temporary tv1880
	mov	edi, DWORD PTR tv1880[esp+36]
*********************************************
SSE /fast
; Compiler-generated listing excerpt (the loop label and the register setup are
; not shown): one pass of the unrolled inner loop for source line 38.
; Note that this excerpt contains only x87 instructions (fld/fadd/fmul/fst);
; no SSE instruction appears in it.
;   x87 stack: two loop-invariant multipliers are already on the stack when the
;   pass starts (loaded outside this excerpt). With a neighbour sum in ST(0),
;   ST(2) scales the sum (presumably omega_over_four) and ST(1) scales the old
;   centre value; that product is SUBTRACTED (fsubp), so ST(1) presumably holds
;   omega - 1 rather than one_minus_omega. NOTE(review): confirm at the loads.
;   edx/eax/ecx are advanced right after the first loads, so the later
;   displacements are relative to the already-advanced pointers; the four
;   results go to [eax-40], [eax-32], [eax-24] and [eax-16].
;   The first three results are stored with fst (no pop) so the value stays in
;   ST(0) and is folded into the next element's sum; the last one uses fstp.
; 38   :                     Gi[j] = omega_over_four * (Gim1[j] + Gip1[j] + Gi[j-1] + Gi[j+1]) + one_minus_omega * Gi[j];

; -- element 1 -> [eax-40] (pointer bumps are interleaved with the loads)
	fld	QWORD PTR [esi+edx*8]
	add	edi, eax
	fadd	QWORD PTR [eax-16]
	add	edx, 4
	add	eax, 32					; 00000020H
	add	ecx, 32					; 00000020H
	fadd	QWORD PTR [ecx-48]
	fadd	QWORD PTR [eax-32]
	fmul	ST(0), ST(2)
	fld	QWORD PTR [eax-40]
	fmul	ST(0), ST(2)
	fsubp	ST(1), ST(0)
	fst	QWORD PTR [eax-40]
; -- element 2 -> [eax-32]; the three remaining terms are added directly onto
;    element 1, which is still in ST(0). The loop test (edx < Nm1 - 3) is issued
;    early; only x87 instructions follow the cmp before the jl that uses it.
	fadd	QWORD PTR [edi+ebx]
	fadd	QWORD PTR [eax-24]
	fadd	QWORD PTR [edi]
	mov	edi, DWORD PTR _Nm1$[esp+40]
	add	edi, -3					; fffffffdH
	cmp	edx, edi
	fmul	ST(0), ST(2)
	fld	QWORD PTR [eax-32]
	fmul	ST(0), ST(2)
	fsubp	ST(1), ST(0)
	fst	QWORD PTR [eax-32]
; -- element 3 -> [eax-24]
	fld	QWORD PTR [ebx+ecx-32]
	fadd	QWORD PTR [eax-16]
	faddp	ST(1), ST(0)
	fadd	QWORD PTR [ecx-32]
	fmul	ST(0), ST(2)
	fld	QWORD PTR [eax-24]
	fmul	ST(0), ST(2)
	fsubp	ST(1), ST(0)
	fst	QWORD PTR [eax-24]
; -- element 4 -> [eax-16]; fstp pops the result, leaving only the two multipliers
	fld	QWORD PTR [esi+edx*8-8]
	fadd	QWORD PTR [eax-8]
	fadd	QWORD PTR [ecx-24]
	faddp	ST(1), ST(0)
	fmul	ST(0), ST(2)
	fld	QWORD PTR [eax-16]
	fmul	ST(0), ST(2)
	fsubp	ST(1), ST(0)
	fstp	QWORD PTR [eax-16]
; -- repeat while edx < Nm1 - 3 (signed; flags come from the cmp above)
	jl	SHORT $LN21@SOR_execut
; after the branch falls through, edi is reloaded from the stack temporary tv1904
	mov	edi, DWORD PTR tv1904[esp+40]
*********************************************
precise
; Compiler-generated listing excerpt (the loop label and the register setup are
; not shown): one pass of the unrolled inner loop for source line 38. The
; instruction sequence is the same as in the "SSE /precise" listing in these
; notes.
;   x87 stack: two loop-invariant multipliers are already on the stack when the
;   pass starts (loaded outside this excerpt). With a neighbour sum in ST(0),
;   ST(1) scales the sum (presumably omega_over_four) and ST(2) scales the old
;   centre value (presumably one_minus_omega); the two products are added.
;   edx/eax/ecx are advanced right after the first loads, so the later
;   displacements are relative to the already-advanced pointers; the four
;   results go to [eax-40], [eax-32], [eax-24] and [eax-16].
;   The first three results are stored with fst (no pop) so the value stays in
;   ST(0) and is folded into the next element's sum; the last one uses fstp.
; 38   :                     Gi[j] = omega_over_four * (Gim1[j] + Gip1[j] + Gi[j-1] + Gi[j+1]) + one_minus_omega * Gi[j];

; -- element 1 -> [eax-40] (pointer bumps are interleaved with the loads)
	fld	QWORD PTR [esi+edx*8]
	add	edi, eax
	fadd	QWORD PTR [ecx-16]
	add	edx, 4
	add	eax, 32					; 00000020H
	add	ecx, 32					; 00000020H
	fadd	QWORD PTR [eax-48]
	fadd	QWORD PTR [eax-32]
	fmul	ST(0), ST(1)
	fld	QWORD PTR [eax-40]
	fmul	ST(0), ST(3)
	faddp	ST(1), ST(0)
	fst	QWORD PTR [eax-40]
; -- element 2 -> [eax-32]; the loop test (edx < Nm1 - 3) is issued early here and
;    its flags are consumed by the jl at the end - only x87 instructions follow the cmp
	fld	QWORD PTR [edi+ebx]
	fadd	QWORD PTR [edi]
	mov	edi, DWORD PTR _Nm1$[esp+32]
	add	edi, -3					; fffffffdH
	cmp	edx, edi
	faddp	ST(1), ST(0)
	fadd	QWORD PTR [eax-24]
	fmul	ST(0), ST(1)
	fld	QWORD PTR [eax-32]
	fmul	ST(0), ST(3)
	faddp	ST(1), ST(0)
	fst	QWORD PTR [eax-32]
; -- element 3 -> [eax-24]; here the centre-value multiplier is pushed (fld ST(2))
;    and multiplied by the memory operand instead of the other way round
	fld	QWORD PTR [ebx+ecx-32]
	fadd	QWORD PTR [ecx-32]
	faddp	ST(1), ST(0)
	fadd	QWORD PTR [eax-16]
	fmul	ST(0), ST(1)
	fld	ST(2)
	fmul	QWORD PTR [eax-24]
	faddp	ST(1), ST(0)
	fst	QWORD PTR [eax-24]
; -- element 4 -> [eax-16]; fstp pops the result, leaving only the two multipliers
	fld	QWORD PTR [esi+edx*8-8]
	fadd	QWORD PTR [ecx-24]
	faddp	ST(1), ST(0)
	fadd	QWORD PTR [eax-8]
	fmul	ST(0), ST(1)
	fld	QWORD PTR [eax-16]
	fmul	ST(0), ST(3)
	faddp	ST(1), ST(0)
	fstp	QWORD PTR [eax-16]
; -- repeat while edx < Nm1 - 3 (signed; flags come from the cmp above)
	jl	SHORT $LN21@SOR_execut
; after the branch falls through, edi is reloaded from the stack temporary tv1880
	mov	edi, DWORD PTR tv1880[esp+36]
	
*********************************************
strict
	; Listing excerpt for the "strict" build: setup, the 4x-unrolled inner loop
	; ($LN20), the one-element-per-iteration remainder loop ($LC3) and the tail
	; of the enclosing loop (branch to $LL6, whose label is outside this excerpt).
	; Both multipliers are loaded onto the x87 stack from the locals
	; _omega_over_four$ and _one_minus_omega$; inside the loops, with a neighbour
	; sum in ST(0), one_minus_omega is ST(1) and omega_over_four is ST(2).
	; In the unrolled loop the first three results of each pass are popped to a
	; stack temporary (tv1841 / tv1439 / tv1456), copied into the array with two
	; 32-bit integer moves, and re-read from that temporary for the next
	; element's sum instead of being kept in an x87 register.
	; 38   :                     Gi[j] = omega_over_four * (Gim1[j] + Gip1[j] + Gi[j-1] + Gi[j+1]) + one_minus_omega * Gi[j];
	
	; -- setup: push the two multipliers; tv1141 = edi - ebx and ebx = esi - edi are
	;    byte distances between the row pointers held in edi/ebx/esi on entry
	;    (presumably Gip1, Gi and Gim1 - TODO confirm against the code before this
	;    excerpt); eax = old ebx + 16 and ecx = edi + 24 become the running pointers
		fld	QWORD PTR _omega_over_four$[esp+60]
		mov	ebp, edi
		fld	QWORD PTR _one_minus_omega$[esp+60]
		sub	ebp, ebx
		lea	eax, DWORD PTR [ebx+16]
		mov	ebx, esi
		lea	ecx, DWORD PTR [edi+24]
		mov	DWORD PTR tv1141[esp+60], ebp
		sub	ebx, edi
	$LN20@SOR_execut:
	; -- element 1 -> tv1841, then copied to [eax-8] as two dwords
		fld	QWORD PTR [esi+edx*8]
		add	edx, 4
		fadd	QWORD PTR [ecx-16]
		add	ecx, 32					; 00000020H
		fadd	QWORD PTR [eax-16]
		fadd	QWORD PTR [eax]
		fmul	ST(0), ST(2)
		fld	QWORD PTR [eax-8]
		fmul	ST(0), ST(2)
		faddp	ST(1), ST(0)
		fstp	QWORD PTR tv1841[esp+60]
		mov	edi, DWORD PTR tv1841[esp+60]
		mov	ebp, DWORD PTR tv1841[esp+64]
		mov	DWORD PTR [eax-8], edi
		mov	edi, DWORD PTR tv1141[esp+60]
		mov	DWORD PTR [eax-4], ebp
	; -- element 2 -> tv1439, then copied to [eax-32] (eax is advanced by 32 in between);
	;    edi = tv1141 + eax addresses the other rows, element 1 is re-read from tv1841
		add	edi, eax
		fld	QWORD PTR [edi+ebx]
		add	eax, 32					; 00000020H
		fadd	QWORD PTR [edi]
		fadd	QWORD PTR tv1841[esp+60]
		fadd	QWORD PTR [eax-24]
		fmul	ST(0), ST(2)
		fld	QWORD PTR [eax-32]
		fmul	ST(0), ST(2)
		faddp	ST(1), ST(0)
		fstp	QWORD PTR tv1439[esp+60]
		mov	edi, DWORD PTR tv1439[esp+60]
		mov	ebp, DWORD PTR tv1439[esp+64]
		mov	DWORD PTR [eax-32], edi
		mov	DWORD PTR [eax-28], ebp
	; -- element 3 -> tv1456, then copied to [eax-24]; one_minus_omega is pushed with
	;    fld ST(1) and multiplied by the memory operand
		fld	QWORD PTR [ebx+ecx-32]
		fadd	QWORD PTR [ecx-32]
		fadd	QWORD PTR tv1439[esp+60]
		fadd	QWORD PTR [eax-16]
		fmul	ST(0), ST(2)
		fld	ST(1)
		fmul	QWORD PTR [eax-24]
		faddp	ST(1), ST(0)
		fstp	QWORD PTR tv1456[esp+60]
		mov	edi, DWORD PTR tv1456[esp+60]
		mov	ebp, DWORD PTR tv1456[esp+64]
		mov	DWORD PTR [eax-24], edi
		mov	DWORD PTR [eax-20], ebp
	; -- element 4 -> stored straight to [eax-16]; ebp = Nm1 and the loop test
	;    (edx < Nm1 - 3) are issued early, only x87 instructions follow the cmp
		fld	QWORD PTR [esi+edx*8-8]
		fadd	QWORD PTR [ecx-24]
		mov	ebp, DWORD PTR _Nm1$[esp+60]
		lea	edi, DWORD PTR [ebp-3]
		cmp	edx, edi
		fadd	QWORD PTR tv1456[esp+60]
		fadd	QWORD PTR [eax-8]
		fmul	ST(0), ST(2)
		fld	QWORD PTR [eax-16]
		fmul	ST(0), ST(2)
		faddp	ST(1), ST(0)
		fstp	QWORD PTR [eax-16]
		jl	$LN20@SOR_execut
	; -- unrolled loop done: reload ecx from tv1880 and pop both multipliers
		mov	ecx, DWORD PTR tv1880[esp+56]
		fstp	ST(1)
		fstp	ST(0)
	$LC19@SOR_execut:
	
	; 37   :                 for (j=1; j<Nm1; j++)
	
	; -- remainder: skipped when edx >= ebp. Otherwise two pointers are read from
	;    [ecx-4] and [ecx-8] (ecx presumably walks a table of row pointers - TODO
	;    confirm), the multipliers are pushed again, eax = [ecx-4] + edx*8 and
	;    edi = ebp - edx is the number of elements left
		cmp	edx, ebp
		jge	SHORT $LN5@SOR_execut
		mov	edi, DWORD PTR [ecx-4]
		fld	QWORD PTR _omega_over_four$[esp+60]
		mov	ebx, DWORD PTR [ecx-8]
		fld	QWORD PTR _one_minus_omega$[esp+60]
		sub	esi, ebx
		lea	eax, DWORD PTR [edi+edx*8]
		sub	ebx, edi
		mov	edi, ebp
		sub	edi, edx
	$LC3@SOR_execut:
	
	; 38   :                     Gi[j] = omega_over_four * (Gim1[j] + Gip1[j] + Gi[j-1] + Gi[j+1]) + one_minus_omega * Gi[j];
	
	; -- one element per iteration, stored directly with fstp to [eax-8]; the jne
	;    uses the flags of "sub edi, 1" (only x87 instructions in between)
		lea	edx, DWORD PTR [eax+ebx]
		fld	QWORD PTR [edx+esi]
		add	eax, 8
		sub	edi, 1
		fadd	QWORD PTR [edx]
		fadd	QWORD PTR [eax-16]
		fadd	QWORD PTR [eax]
		fmul	ST(0), ST(2)
		fld	QWORD PTR [eax-8]
		fmul	ST(0), ST(2)
		faddp	ST(1), ST(0)
		fstp	QWORD PTR [eax-8]
		jne	SHORT $LC3@SOR_execut
	; -- pop both multipliers again
		fstp	ST(1)
		fstp	ST(0)
	$LN5@SOR_execut:
	; -- enclosing-loop tail: advance ecx by 4, decrement the counter in tv1929,
	;    save ecx to tv1880 and loop while the counter is non-zero (mov leaves the
	;    flags from the sub untouched)
		add	ecx, 4
		sub	DWORD PTR tv1929[esp+56], 1
		mov	DWORD PTR tv1880[esp+56], ecx
		jne	$LL6@SOR_execut

*********************************************
fast
; Compiler-generated listing excerpt (the loop label and the register setup are
; not shown): one pass of the unrolled inner loop for source line 38. The
; instruction sequence is the same as in the "SSE /fast" listing in these notes.
;   x87 stack: two loop-invariant multipliers are already on the stack when the
;   pass starts (loaded outside this excerpt). With a neighbour sum in ST(0),
;   ST(2) scales the sum (presumably omega_over_four) and ST(1) scales the old
;   centre value; that product is SUBTRACTED (fsubp), so ST(1) presumably holds
;   omega - 1 rather than one_minus_omega. NOTE(review): confirm at the loads.
;   edx/eax/ecx are advanced right after the first loads, so the later
;   displacements are relative to the already-advanced pointers; the four
;   results go to [eax-40], [eax-32], [eax-24] and [eax-16].
;   The first three results are stored with fst (no pop) so the value stays in
;   ST(0) and is folded into the next element's sum; the last one uses fstp.
; 38   :                     Gi[j] = omega_over_four * (Gim1[j] + Gip1[j] + Gi[j-1] + Gi[j+1]) + one_minus_omega * Gi[j];

; -- element 1 -> [eax-40] (pointer bumps are interleaved with the loads)
	fld	QWORD PTR [esi+edx*8]
	add	edi, eax
	fadd	QWORD PTR [eax-16]
	add	edx, 4
	add	eax, 32					; 00000020H
	add	ecx, 32					; 00000020H
	fadd	QWORD PTR [ecx-48]
	fadd	QWORD PTR [eax-32]
	fmul	ST(0), ST(2)
	fld	QWORD PTR [eax-40]
	fmul	ST(0), ST(2)
	fsubp	ST(1), ST(0)
	fst	QWORD PTR [eax-40]
; -- element 2 -> [eax-32]; the three remaining terms are added directly onto
;    element 1, which is still in ST(0). The loop test (edx < Nm1 - 3) is issued
;    early; only x87 instructions follow the cmp before the jl that uses it.
	fadd	QWORD PTR [edi+ebx]
	fadd	QWORD PTR [eax-24]
	fadd	QWORD PTR [edi]
	mov	edi, DWORD PTR _Nm1$[esp+40]
	add	edi, -3					; fffffffdH
	cmp	edx, edi
	fmul	ST(0), ST(2)
	fld	QWORD PTR [eax-32]
	fmul	ST(0), ST(2)
	fsubp	ST(1), ST(0)
	fst	QWORD PTR [eax-32]
; -- element 3 -> [eax-24]
	fld	QWORD PTR [ebx+ecx-32]
	fadd	QWORD PTR [eax-16]
	faddp	ST(1), ST(0)
	fadd	QWORD PTR [ecx-32]
	fmul	ST(0), ST(2)
	fld	QWORD PTR [eax-24]
	fmul	ST(0), ST(2)
	fsubp	ST(1), ST(0)
	fst	QWORD PTR [eax-24]
; -- element 4 -> [eax-16]; fstp pops the result, leaving only the two multipliers
	fld	QWORD PTR [esi+edx*8-8]
	fadd	QWORD PTR [eax-8]
	fadd	QWORD PTR [ecx-24]
	faddp	ST(1), ST(0)
	fmul	ST(0), ST(2)
	fld	QWORD PTR [eax-16]
	fmul	ST(0), ST(2)
	fsubp	ST(1), ST(0)
	fstp	QWORD PTR [eax-16]
; -- repeat while edx < Nm1 - 3 (signed; flags come from the cmp above)
	jl	SHORT $LN21@SOR_execut
; after the branch falls through, edi is reloaded from the stack temporary tv1904
	mov	edi, DWORD PTR tv1904[esp+40]