

#ifdef SUBROUTINE22

#if COMPLEX == 0

/* Real matmul() */

/* SUBROUTINE12 -- multiply a rank-1 matrix_a by a rank-2 matrix_b.
 *
 * Computes r(j) = \sum_i matrix_a(i) * matrix_b(i,j) and returns the
 * descriptor obtained from temp_array(), holding ext_b2 elements of
 * REAL10_SIZE bytes each.  runtime_error(matrix_mismatch) is called
 * when the extent of matrix_a differs from the first extent of
 * matrix_b.
 *
 * The running sum lives on the x87 register stack across separate asm
 * statements, so the order of those statements is significant.  LOAD_A
 * and LOAD_B are defined by the including file and refer to the locals
 * p, q and (in some variants) temp by name. */
g95_array_descriptor *SUBROUTINE12(g95_array_descriptor *matrix_a,
				   g95_array_descriptor *matrix_b) {
G95_AINT i, j, ext_a, ext_b1, ext_b2, dp, dq;
g95_array_descriptor *r;
char *p, *q, *t;
int temp;

    /* Each extent is ubound - lbound + 1, clamped at zero. */
    ext_a = matrix_a->info[0].ubound - matrix_a->info[0].lbound + 1;
    if (ext_a < 0)
	ext_a = 0;

    ext_b1 = matrix_b->info[0].ubound - matrix_b->info[0].lbound + 1;
    if (ext_b1 < 0)
	ext_b1 = 0;

    ext_b2 = matrix_b->info[1].ubound - matrix_b->info[1].lbound + 1;
    if (ext_b2 < 0)
	ext_b2 = 0;

    if (ext_a != ext_b1)
	runtime_error(matrix_mismatch);

    r = temp_array(1, REAL10_SIZE, ext_b2);
    temp = 0;   /* memory slot some LOAD_A/LOAD_B variants write through */

    /* r(j) = \sum_i matrix_a(i) * matrix_b(i,j) */

    /* Byte steps between consecutive elements along the summed dimension. */
    dp = matrix_a->info[0].mult;
    dq = matrix_b->info[0].mult;

    for(j=0; j<ext_b2; j++) {
	/* Push 0.0: the accumulator for r(j). */
	asm("fldz");

	/* p -> first element of matrix_a, q -> first element of column j. */
	p = matrix_a->offset +
	    matrix_a->info[0].mult * matrix_a->info[0].lbound;

	q = matrix_b->offset +
	    matrix_b->info[0].mult * matrix_b->info[0].lbound +
	    matrix_b->info[1].mult*(j + matrix_b->info[1].lbound);

	for(i=0; i<ext_a; i++) {
	    asm(LOAD_A);
	    asm(LOAD_B);
      
	    /* Multiply the two loaded operands, add into the accumulator. */
	    asm("fmulp %st(1)\n"
		"faddp %st(1)\n");

	    p += dp;
	    q += dq;
	}

	/* Pop the accumulator into result element j (80-bit store). */
	t = r->base + j*REAL10_SIZE;
	asm("mov %0, %" EAX "\n"
	    "fstpt (%" EAX ")\n" : : "m" (t) : EAX);
    }

    return r;
}


/* SUBROUTINE21 -- multiply a rank-2 matrix_a by a rank-1 matrix_b.
 *
 * Computes r(i) = \sum_j matrix_a(i,j) * matrix_b(j) and returns the
 * descriptor obtained from temp_array(), holding ext_a1 elements of
 * REAL10_SIZE bytes each.  runtime_error(matrix_mismatch) is called
 * when the second extent of matrix_a differs from the extent of
 * matrix_b.
 *
 * Note that in the loops below the C variable i walks the summed
 * (second) dimension of matrix_a and j walks the result index.  The
 * asm statements share x87 stack state and must stay in order; LOAD_A
 * and LOAD_B come from the including file and use the locals p, q and
 * (in some variants) temp by name. */
g95_array_descriptor *SUBROUTINE21(g95_array_descriptor *matrix_a,
				   g95_array_descriptor *matrix_b) {
G95_AINT i, j, ext_a1, ext_a2, ext_b, dp, dq;
char *p, *q, *result, *t;
g95_array_descriptor *r;
int temp;

    /* Each extent is ubound - lbound + 1, clamped at zero. */
    ext_a1 = matrix_a->info[0].ubound - matrix_a->info[0].lbound + 1;
    if (ext_a1 < 0)
	ext_a1 = 0;

    ext_a2 = matrix_a->info[1].ubound - matrix_a->info[1].lbound + 1;
    if (ext_a2 < 0)
	ext_a2 = 0;

    ext_b = matrix_b->info[0].ubound - matrix_b->info[0].lbound + 1;
    if (ext_b < 0)
	ext_b = 0;

    if (ext_a2 != ext_b)
	runtime_error(matrix_mismatch);

    r = temp_array(1, REAL10_SIZE, ext_a1);
    result = r->base;
    temp = 0;   /* memory slot some LOAD_A/LOAD_B variants write through */

    /* r(i) = \sum_j matrix_a(i,j) * matrix_b(j) */

    /* Sum things such that we traverse matrix A in ascending memory
     * locations, assuming it is contiguous.  This improves performance. */

    /* The result is accumulated in memory, so start from all-zero bytes. */
    memset(result, '\0', REAL10_SIZE*ext_a1);

    dp = matrix_a->info[0].mult;
    dq = matrix_b->info[0].mult;

    /* q -> first element of matrix_b; advanced once per outer iteration. */
    q = matrix_b->offset + matrix_b->info[0].mult*matrix_b->info[0].lbound;

    for(i=0; i<ext_a2; i++) {
	/* p -> first element of the i-th slice along dimension 2 of A. */
	p = matrix_a->offset + matrix_a->info[0].mult*matrix_a->info[0].lbound
	    + matrix_a->info[1].mult * (i+matrix_a->info[1].lbound);

	for(j=0; j<ext_a1; j++) {
	    t = result + j*REAL10_SIZE;

	    /* Push the current partial sum for result element j. */
	    asm("mov %0, %" EAX "\n"
		"fldt (%" EAX ")\n" : : "m" (t) : EAX );

	    asm(LOAD_A);
	    asm(LOAD_B);

	    /* Multiply, add into the partial sum and pop it back to memory. */
	    asm("fmulp %%st(1)\n"
		"faddp %%st(1)\n"
		"mov %0, %" EAX "\n"
		"fstpt (%" EAX ")\n" : : "m" (t) : EAX);

	    p += dp;
	}

	q += dq;
    }

    return r;
}


/* SUBROUTINE22 -- multiply a rank-2 matrix_a by a rank-2 matrix_b.
 *
 * Computes r(i,j) = \sum_k matrix_a(i,k) * matrix_b(k,j) and returns
 * the descriptor obtained from temp_array(), with extents ext_a1 by
 * ext_b2 and REAL10_SIZE bytes per element.
 * runtime_error(matrix_mismatch) is called when the second extent of
 * matrix_a differs from the first extent of matrix_b.
 *
 * The asm statements share x87 stack state and must stay in order;
 * LOAD_A and LOAD_B come from the including file and use the locals
 * p, q and (in some variants) temp by name. */
g95_array_descriptor *SUBROUTINE22(g95_array_descriptor *matrix_a,
				   g95_array_descriptor *matrix_b) {
G95_AINT i, j, k, ext_a1, ext_a2, ext_b1, ext_b2, dp;
g95_array_descriptor *r;
char *m, *p, *q;
int temp;

    /* Each extent is ubound - lbound + 1, clamped at zero. */
    ext_a1 = matrix_a->info[0].ubound - matrix_a->info[0].lbound + 1;
    if (ext_a1 < 0)
	ext_a1 = 0;

    ext_a2 = matrix_a->info[1].ubound - matrix_a->info[1].lbound + 1;
    if (ext_a2 < 0)
	ext_a2 = 0;

    ext_b1 = matrix_b->info[0].ubound - matrix_b->info[0].lbound + 1;
    if (ext_b1 < 0)
	ext_b1 = 0;

    ext_b2 = matrix_b->info[1].ubound - matrix_b->info[1].lbound + 1;
    if (ext_b2 < 0)
	ext_b2 = 0;

    if (ext_a2 != ext_b1)
	runtime_error(matrix_mismatch);

    r = temp_array(2, REAL10_SIZE, ext_a1, ext_b2);
    temp = 0;   /* memory slot some LOAD_A/LOAD_B variants write through */

    /* r(i,j) = \sum_k matrix_a(i,k) * matrix_b(k,j) */

    /* The result is accumulated in memory, so start from all-zero bytes. */
    memset(r->base, '\0', REAL10_SIZE * ext_a1 * ext_b2);

    dp = matrix_a->info[0].mult;

    for(j=0; j<ext_b2; j++) {
	for(k=0; k<ext_b1; k++) {
	    /* m -> first element of result column j. */
	    m = r->offset +
		r->info[0].mult*r->info[0].lbound +
		r->info[1].mult*(r->info[1].lbound + j);

	    /* q -> matrix_b(k,j); it stays fixed over the inner loop. */
	    q = matrix_b->offset +
		matrix_b->info[0].mult * (k + matrix_b->info[0].lbound) +
		matrix_b->info[1].mult * (j + matrix_b->info[1].lbound);

	    /* p -> first element of the k-th slice along dimension 2 of A. */
	    p = matrix_a->offset +
		matrix_a->info[0].mult * matrix_a->info[0].lbound +
		matrix_a->info[1].mult * (k + matrix_a->info[1].lbound);

	    for(i=0; i<ext_a1; i++) {
		/* Push the current partial sum r(i,j). */
		asm("mov %0, %" EAX "\n"
		    "fldt (%" EAX ")\n" : : "m" (m) : EAX );

		asm(LOAD_A);
		asm(LOAD_B);

		/* Multiply, add into the partial sum and pop it back. */
		asm("fmulp %%st(1)\n"
		    "faddp %%st(1)\n"
		    "mov %0, %" EAX "\n"
		    "fstpt (%" EAX ")\n" : : "m" (m) : EAX);

		p += dp;
		m += REAL10_SIZE;   /* steps by the element size, not info[0].mult */
	    }
	}
    }

    return r;
}



#elif COMPLEX == 1

/* Complex from complex/real */

/* SUBROUTINE12 -- rank-1 complex matrix_a times rank-2 real matrix_b.
 *
 * Computes r(j) = \sum_i matrix_a(i) * matrix_b(i,j).  Returns the
 * descriptor obtained from temp_array() with ext_b2 elements;
 * runtime_error(matrix_mismatch) is called when the extent of matrix_a
 * differs from the first extent of matrix_b.
 *
 * Two accumulators (one per complex component) live on the x87 stack
 * across separate asm statements, so their order is significant.
 * LOAD_A pushes the two components of *p, LOAD_B pushes the scalar at
 * *q, and RESULT pops both accumulators to *product; all three are
 * defined by the including file and use those locals (and sometimes
 * temp) by name.
 *
 * NOTE(review): the element size is fixed at 2*REAL10_SIZE here, while
 * SUBROUTINE21/SUBROUTINE22 in this branch take it from an operand's
 * element_size and RESULT decides how many bytes are stored -- confirm
 * these agree for every instantiation. */
g95_array_descriptor *SUBROUTINE12(g95_array_descriptor *matrix_a,
				   g95_array_descriptor *matrix_b) {
G95_AINT i, j, ext_a, ext_b1, ext_b2, dp, dq;
g95_array_descriptor *r;
char *p, *q, *product;
int temp;

    /* Each extent is ubound - lbound + 1, clamped at zero. */
    ext_a = matrix_a->info[0].ubound - matrix_a->info[0].lbound + 1;
    if (ext_a < 0)
	ext_a = 0;

    ext_b1 = matrix_b->info[0].ubound - matrix_b->info[0].lbound + 1;
    if (ext_b1 < 0)
	ext_b1 = 0;

    ext_b2 = matrix_b->info[1].ubound - matrix_b->info[1].lbound + 1;
    if (ext_b2 < 0)
	ext_b2 = 0;

    if (ext_a != ext_b1)
	runtime_error(matrix_mismatch);

    r = temp_array(1, 2*REAL10_SIZE, ext_b2);
    temp = 0;   /* memory slot some LOAD_B variants write through */

    /* r(j) = \sum_i matrix_a(i) * matrix_b(i,j) */

    dp = matrix_a->info[0].mult;
    dq = matrix_b->info[0].mult;

    for(j=0; j<ext_b2; j++) {
	/* Push two zero accumulators: st1 = component 0, st0 = component 1. */
	asm("fldz\n"
	    "fldz\n");

	p = matrix_a->offset +
	    matrix_a->info[0].mult * matrix_a->info[0].lbound;

	q = matrix_b->offset +
	    matrix_b->info[0].mult * matrix_b->info[0].lbound +
	    matrix_b->info[1].mult*(j + matrix_b->info[1].lbound);

	for(i=0; i<ext_a; i++) {
	    asm(LOAD_A);
	    asm(LOAD_B);
      
	    /* Stack: st0=b, st1=a1, st2=a0, st3=acc1, st4=acc0.
	     * acc0 += a0*b; acc1 += a1*b; leaves st0=acc1, st1=acc0. */
	    asm("fxch %st(2)\n"
		"fmul %st(2)\n"
		"fadd %st(4)\n"
		"fstp %st(4)\n"
		"fmulp %st(1)\n"
		"faddp %st(1)\n");

	    p += dp;
	    q += dq;
	}

	product = r->base + 2*REAL10_SIZE*j;
	asm(RESULT);
    }

    return r;
}


/* SUBROUTINE21 -- rank-2 complex matrix_a times rank-1 real matrix_b.
 *
 * Computes r(i) = \sum_j matrix_a(i,j) * matrix_b(j).  Returns the
 * descriptor obtained from temp_array() with ext_a1 elements of
 * matrix_a->element_size bytes; runtime_error(matrix_mismatch) is
 * called when the second extent of matrix_a differs from the extent of
 * matrix_b.
 *
 * In the loops below the C variable i walks the summed dimension and j
 * the result index.  LOAD_RESULT, LOAD_A, LOAD_B and RESULT are defined
 * by the including file, use the locals product, p, q (and sometimes
 * temp) by name, and share x87 stack state, so the asm statements must
 * stay in order. */
g95_array_descriptor *SUBROUTINE21(g95_array_descriptor *matrix_a,
				   g95_array_descriptor *matrix_b) {
G95_AINT i, j, ext_a1, ext_a2, ext_b, dp, dq;
g95_array_descriptor *r;
char *p, *q, *product;
int temp;

    /* Each extent is ubound - lbound + 1, clamped at zero. */
    ext_a1 = matrix_a->info[0].ubound - matrix_a->info[0].lbound + 1;
    if (ext_a1 < 0)
	ext_a1 = 0;

    ext_a2 = matrix_a->info[1].ubound - matrix_a->info[1].lbound + 1;
    if (ext_a2 < 0)
	ext_a2 = 0;

    ext_b = matrix_b->info[0].ubound - matrix_b->info[0].lbound + 1;
    if (ext_b < 0)
	ext_b = 0;

    if (ext_a2 != ext_b)
	runtime_error(matrix_mismatch);

    /* The result takes its element size from the complex operand A. */
    r = temp_array(1, matrix_a->element_size, ext_a1);
    temp = 0;   /* memory slot some LOAD_B variants write through */

    /* r(i) = \sum_j matrix_a(i,j) * matrix_b(j) */

    /* Sum things such that we traverse matrix A in ascending memory
     * locations, assuming it is contiguous.  This improves performance. */

    /* The result is accumulated in memory, so start from all-zero bytes. */
    memset(r->base, '\0', r->element_size*ext_a1);

    dp = matrix_a->info[0].mult;
    dq = matrix_b->info[0].mult;

    q = matrix_b->offset + matrix_b->info[0].mult*matrix_b->info[0].lbound;

    for(i=0; i<ext_a2; i++) {
	p = matrix_a->offset + matrix_a->info[0].mult*matrix_a->info[0].lbound
	    + matrix_a->info[1].mult * (i+matrix_a->info[1].lbound);

	for(j=0; j<ext_a1; j++) {
	    product = r->base + j*r->element_size;

	    asm(LOAD_RESULT);

	    asm(LOAD_A);
	    asm(LOAD_B);

	    /* Stack: st0=b, st1=a1, st2=a0, st3=r1, st4=r0.
	     * r0 += a0*b; r1 += a1*b; leaves st0=r1, st1=r0 for RESULT. */
	    asm("fxch %st(2)\n"
		"fmul %st(2)\n"
		"fadd %st(4)\n"
		"fstp %st(4)\n"
		"fmulp %st(1)\n"
		"faddp %st(1)\n");

	    asm(RESULT);

	    p += dp;
	}

	q += dq;
    }

    return r;
}


/* SUBROUTINE22 -- rank-2 complex matrix_a times rank-2 real matrix_b.
 *
 * Computes r(i,j) = \sum_k matrix_a(i,k) * matrix_b(k,j).  Returns the
 * descriptor obtained from temp_array(), ext_a1 by ext_b2 elements of
 * matrix_a->element_size bytes; runtime_error(matrix_mismatch) is
 * called when the second extent of matrix_a differs from the first
 * extent of matrix_b.
 *
 * LOAD_RESULT, LOAD_A, LOAD_B and RESULT are defined by the including
 * file, use the locals product, p, q (and sometimes temp) by name, and
 * share x87 stack state, so the asm statements must stay in order. */
g95_array_descriptor *SUBROUTINE22(g95_array_descriptor *matrix_a,
				   g95_array_descriptor *matrix_b) {
G95_AINT i, j, k, ext_a1, ext_a2, ext_b1, ext_b2, dp;
g95_array_descriptor *r;
char *p, *q, *product;
int temp;

    /* Each extent is ubound - lbound + 1, clamped at zero. */
    ext_a1 = matrix_a->info[0].ubound - matrix_a->info[0].lbound + 1;
    if (ext_a1 < 0)
	ext_a1 = 0;

    ext_a2 = matrix_a->info[1].ubound - matrix_a->info[1].lbound + 1;
    if (ext_a2 < 0)
	ext_a2 = 0;

    ext_b1 = matrix_b->info[0].ubound - matrix_b->info[0].lbound + 1;
    if (ext_b1 < 0)
	ext_b1 = 0;

    ext_b2 = matrix_b->info[1].ubound - matrix_b->info[1].lbound + 1;
    if (ext_b2 < 0)
	ext_b2 = 0;

    if (ext_a2 != ext_b1)
	runtime_error(matrix_mismatch);

    /* The result takes its element size from the complex operand A. */
    r = temp_array(2, matrix_a->element_size, ext_a1, ext_b2);
    temp = 0;   /* memory slot some LOAD_B variants write through */

    /* r(i,j) = \sum_k matrix_a(i,k) * matrix_b(k,j) */

    /* The result is accumulated in memory, so start from all-zero bytes. */
    memset(r->base, '\0', matrix_a->element_size * ext_a1 * ext_b2);

    dp = matrix_a->info[0].mult;

    for(j=0; j<ext_b2; j++) {
	for(k=0; k<ext_b1; k++) {
	    /* product -> first element of result column j. */
	    product = r->offset +
		r->info[0].mult*r->info[0].lbound +
		r->info[1].mult*(r->info[1].lbound + j);

	    /* q -> matrix_b(k,j); it stays fixed over the inner loop. */
	    q = matrix_b->offset +
		matrix_b->info[0].mult * (k + matrix_b->info[0].lbound) +
		matrix_b->info[1].mult * (j + matrix_b->info[1].lbound);

	    p = matrix_a->offset +
		matrix_a->info[0].mult * matrix_a->info[0].lbound +
		matrix_a->info[1].mult * (k + matrix_a->info[1].lbound);

	    for(i=0; i<ext_a1; i++) {
		asm(LOAD_RESULT);

		asm(LOAD_A);
		asm(LOAD_B);

		/* Stack: st0=b, st1=a1, st2=a0, st3=r1, st4=r0.
		 * r0 += a0*b; r1 += a1*b; leaves st0=r1, st1=r0. */
		asm("fxch %st(2)\n"
		    "fmul %st(2)\n"
		    "fadd %st(4)\n"
		    "fstp %st(4)\n"
		    "fmulp %st(1)\n"
		    "faddp %st(1)\n");

		asm(RESULT);

		p += dp;

		/* Result elements are stepped by the element size. */
		product += matrix_a->element_size;
	    }
	}
    }

    return r;
}


#elif COMPLEX == 2

/* Complex from real/complex */

/* SUBROUTINE12 -- rank-1 real matrix_a times rank-2 complex matrix_b.
 *
 * Computes r(j) = \sum_i matrix_a(i) * matrix_b(i,j).  Returns the
 * descriptor obtained from temp_array() with ext_b2 elements;
 * runtime_error(matrix_mismatch) is called when the extent of matrix_a
 * differs from the first extent of matrix_b.
 *
 * Two accumulators (one per complex component) live on the x87 stack
 * across separate asm statements, so their order is significant.
 * LOAD_B pushes the two components of *q, LOAD_A pushes the scalar at
 * *p, and RESULT pops both accumulators to *product; all three are
 * defined by the including file and use those locals (and sometimes
 * temp) by name.
 *
 * NOTE(review): the element size is fixed at 2*REAL10_SIZE here, while
 * SUBROUTINE21/SUBROUTINE22 in this branch take it from an operand's
 * element_size and RESULT decides how many bytes are stored -- confirm
 * these agree for every instantiation. */
g95_array_descriptor *SUBROUTINE12(g95_array_descriptor *matrix_a,
				   g95_array_descriptor *matrix_b) {
G95_AINT i, j, ext_a, ext_b1, ext_b2, dp, dq;
g95_array_descriptor *r;
char *p, *q, *product;
int temp;

    /* Each extent is ubound - lbound + 1, clamped at zero. */
    ext_a = matrix_a->info[0].ubound - matrix_a->info[0].lbound + 1;
    if (ext_a < 0)
	ext_a = 0;

    ext_b1 = matrix_b->info[0].ubound - matrix_b->info[0].lbound + 1;
    if (ext_b1 < 0)
	ext_b1 = 0;

    ext_b2 = matrix_b->info[1].ubound - matrix_b->info[1].lbound + 1;
    if (ext_b2 < 0)
	ext_b2 = 0;

    if (ext_a != ext_b1)
	runtime_error(matrix_mismatch);

    r = temp_array(1, 2*REAL10_SIZE, ext_b2);
    temp = 0;   /* memory slot some LOAD_A variants write through */

    /* r(j) = \sum_i matrix_a(i) * matrix_b(i,j) */

    dp = matrix_a->info[0].mult;
    dq = matrix_b->info[0].mult;

    for(j=0; j<ext_b2; j++) {
	/* Push two zero accumulators: st1 = component 0, st0 = component 1. */
	asm("fldz\n"
	    "fldz\n");

	p = matrix_a->offset +
	    matrix_a->info[0].mult * matrix_a->info[0].lbound;

	q = matrix_b->offset +
	    matrix_b->info[0].mult * matrix_b->info[0].lbound +
	    matrix_b->info[1].mult*(j + matrix_b->info[1].lbound);

	for(i=0; i<ext_a; i++) {
	    /* B (complex) is pushed first here, then the real A scalar. */
	    asm(LOAD_B);
	    asm(LOAD_A);
      
	    /* Stack: st0=a, st1=b1, st2=b0, st3=acc1, st4=acc0.
	     * acc0 += b0*a; acc1 += b1*a; leaves st0=acc1, st1=acc0. */
	    asm("fxch %st(2)\n"
		"fmul %st(2)\n"
		"fadd %st(4)\n"
		"fstp %st(4)\n"
		"fmulp %st(1)\n"
		"faddp %st(1)\n");

	    p += dp;
	    q += dq;
	}

	product = r->base + 2*REAL10_SIZE*j;
	asm(RESULT);
    }

    return r;
}


/* SUBROUTINE21 -- rank-2 real matrix_a times rank-1 complex matrix_b.
 *
 * Computes r(i) = \sum_j matrix_a(i,j) * matrix_b(j).  Returns the
 * descriptor obtained from temp_array() with ext_a1 elements of
 * matrix_b->element_size bytes; runtime_error(matrix_mismatch) is
 * called when the second extent of matrix_a differs from the extent of
 * matrix_b.
 *
 * In the loops below the C variable i walks the summed dimension and j
 * the result index.  LOAD_RESULT, LOAD_B, LOAD_A and RESULT are defined
 * by the including file, use the locals product, q, p (and sometimes
 * temp) by name, and share x87 stack state, so the asm statements must
 * stay in order. */
g95_array_descriptor *SUBROUTINE21(g95_array_descriptor *matrix_a,
				   g95_array_descriptor *matrix_b) {
G95_AINT i, j, ext_a1, ext_a2, ext_b, dp, dq;
g95_array_descriptor *r;
char *p, *q, *product;
int temp;

    /* Each extent is ubound - lbound + 1, clamped at zero. */
    ext_a1 = matrix_a->info[0].ubound - matrix_a->info[0].lbound + 1;
    if (ext_a1 < 0)
	ext_a1 = 0;

    ext_a2 = matrix_a->info[1].ubound - matrix_a->info[1].lbound + 1;
    if (ext_a2 < 0)
	ext_a2 = 0;

    ext_b = matrix_b->info[0].ubound - matrix_b->info[0].lbound + 1;
    if (ext_b < 0)
	ext_b = 0;

    if (ext_a2 != ext_b)
	runtime_error(matrix_mismatch);

    /* In this branch matrix_a is the real operand and matrix_b the
     * complex one.  LOAD_RESULT/RESULT read and write two components
     * per result element, so the result must be sized like an element
     * of matrix_b; sizing it from matrix_a would make the elements too
     * small whenever A's elements are narrower than B's. */
    r = temp_array(1, matrix_b->element_size, ext_a1);
    temp = 0;   /* memory slot some LOAD_A variants write through */

    /* r(i) = \sum_j matrix_a(i,j) * matrix_b(j) */

    /* Sum things such that we traverse matrix A in ascending memory
     * locations, assuming it is contiguous.  This improves performance. */

    /* The result is accumulated in memory, so start from all-zero bytes. */
    memset(r->base, '\0', r->element_size*ext_a1);

    dp = matrix_a->info[0].mult;
    dq = matrix_b->info[0].mult;

    q = matrix_b->offset + matrix_b->info[0].mult*matrix_b->info[0].lbound;

    for(i=0; i<ext_a2; i++) {
	p = matrix_a->offset +
	    matrix_a->info[0].mult*matrix_a->info[0].lbound +
	    matrix_a->info[1].mult * (i+matrix_a->info[1].lbound);

	for(j=0; j<ext_a1; j++) {
	    product = r->base + j*r->element_size;

	    asm(LOAD_RESULT);

	    /* B (complex) is pushed first here, then the real A scalar. */
	    asm(LOAD_B);
	    asm(LOAD_A);

	    /* Stack: st0=a, st1=b1, st2=b0, st3=r1, st4=r0.
	     * r0 += b0*a; r1 += b1*a; leaves st0=r1, st1=r0 for RESULT. */
	    asm("fxch %st(2)\n"
		"fmul %st(2)\n"
		"fadd %st(4)\n"
		"fstp %st(4)\n"
		"fmulp %st(1)\n"
		"faddp %st(1)\n");

	    asm(RESULT);

	    p += dp;
	}

	q += dq;
    }

    return r;
}


/* SUBROUTINE22 -- rank-2 real matrix_a times rank-2 complex matrix_b.
 *
 * Computes r(i,j) = \sum_k matrix_a(i,k) * matrix_b(k,j).  Returns the
 * descriptor obtained from temp_array(), ext_a1 by ext_b2 elements of
 * matrix_b->element_size bytes; runtime_error(matrix_mismatch) is
 * called when the second extent of matrix_a differs from the first
 * extent of matrix_b.
 *
 * LOAD_RESULT, LOAD_B, LOAD_A and RESULT are defined by the including
 * file, use the locals product, q, p (and sometimes temp) by name, and
 * share x87 stack state, so the asm statements must stay in order. */
g95_array_descriptor *SUBROUTINE22(g95_array_descriptor *matrix_a,
				   g95_array_descriptor *matrix_b) {
G95_AINT i, j, k, ext_a1, ext_a2, ext_b1, ext_b2, dp;
g95_array_descriptor *r;
char *p, *q, *product;
int temp;

    /* Each extent is ubound - lbound + 1, clamped at zero. */
    ext_a1 = matrix_a->info[0].ubound - matrix_a->info[0].lbound + 1;
    if (ext_a1 < 0)
	ext_a1 = 0;

    ext_a2 = matrix_a->info[1].ubound - matrix_a->info[1].lbound + 1;
    if (ext_a2 < 0)
	ext_a2 = 0;

    ext_b1 = matrix_b->info[0].ubound - matrix_b->info[0].lbound + 1;
    if (ext_b1 < 0)
	ext_b1 = 0;

    ext_b2 = matrix_b->info[1].ubound - matrix_b->info[1].lbound + 1;
    if (ext_b2 < 0)
	ext_b2 = 0;

    if (ext_a2 != ext_b1)
	runtime_error(matrix_mismatch);

    /* In this branch matrix_a is the real operand and matrix_b the
     * complex one.  LOAD_RESULT/RESULT read and write two components
     * per result element, so the allocation, the zero fill and the
     * element step below all use the element size of matrix_b; using
     * matrix_a's would make the elements too small whenever A's
     * elements are narrower than B's. */
    r = temp_array(2, matrix_b->element_size, ext_a1, ext_b2);
    temp = 0;   /* memory slot some LOAD_A variants write through */

    /* r(i,j) = \sum_k matrix_a(i,k) * matrix_b(k,j) */

    /* The result is accumulated in memory, so start from all-zero bytes. */
    memset(r->base, '\0', matrix_b->element_size * ext_a1 * ext_b2);

    dp = matrix_a->info[0].mult;

    for(j=0; j<ext_b2; j++) {
	for(k=0; k<ext_b1; k++) {
	    /* product -> first element of result column j. */
	    product = r->offset +
		r->info[0].mult*r->info[0].lbound +
		r->info[1].mult*(r->info[1].lbound + j);

	    /* q -> matrix_b(k,j); it stays fixed over the inner loop. */
	    q = matrix_b->offset +
		matrix_b->info[0].mult * (k + matrix_b->info[0].lbound) +
		matrix_b->info[1].mult * (j + matrix_b->info[1].lbound);

	    p = matrix_a->offset +
		matrix_a->info[0].mult * matrix_a->info[0].lbound +
		matrix_a->info[1].mult * (k + matrix_a->info[1].lbound);

	    for(i=0; i<ext_a1; i++) {
		asm(LOAD_RESULT);

		/* B (complex) is pushed first here, then the real A scalar. */
		asm(LOAD_B);
		asm(LOAD_A);

		/* Stack: st0=a, st1=b1, st2=b0, st3=r1, st4=r0.
		 * r0 += b0*a; r1 += b1*a; leaves st0=r1, st1=r0. */
		asm("fxch %st(2)\n"
		    "fmul %st(2)\n"
		    "fadd %st(4)\n"
		    "fstp %st(4)\n"
		    "fmulp %st(1)\n"
		    "faddp %st(1)\n");

		asm(RESULT);

		p += dp;

		/* Result elements are stepped by the result element size. */
		product += matrix_b->element_size;
	    }
	}
    }

    return r;
}


#elif COMPLEX == 3

/* Complex from complex/complex */

/* SUBROUTINE12 -- rank-1 complex matrix_a times rank-2 complex matrix_b.
 *
 * Computes r(j) = \sum_i matrix_a(i) * matrix_b(i,j).  Returns the
 * descriptor obtained from temp_array() with ext_b2 elements of
 * 2*REAL10_SIZE bytes; runtime_error(matrix_mismatch) is called when
 * the extent of matrix_a differs from the first extent of matrix_b.
 *
 * Two accumulators live on the x87 stack across separate asm
 * statements, so their order is significant.  LOAD_B, LOAD_A and
 * RESULT are defined by the including file; presumably each load pushes
 * the lower-offset component first, as the COMPLEX 1/2 macros do --
 * the stack comments below rely on that. */
g95_array_descriptor *SUBROUTINE12(g95_array_descriptor *matrix_a,
				   g95_array_descriptor *matrix_b) {
G95_AINT i, j, ext_a, ext_b1, ext_b2, dp, dq;
g95_array_descriptor *r;
char *p, *q, *product;
int temp;

    /* Each extent is ubound - lbound + 1, clamped at zero. */
    ext_a = matrix_a->info[0].ubound - matrix_a->info[0].lbound + 1;
    if (ext_a < 0)
	ext_a = 0;

    ext_b1 = matrix_b->info[0].ubound - matrix_b->info[0].lbound + 1;
    if (ext_b1 < 0)
	ext_b1 = 0;

    ext_b2 = matrix_b->info[1].ubound - matrix_b->info[1].lbound + 1;
    if (ext_b2 < 0)
	ext_b2 = 0;

    if (ext_a != ext_b1)
	runtime_error(matrix_mismatch);

    r = temp_array(1, 2*REAL10_SIZE, ext_b2);
    temp = 0;

    /* r(j) = \sum_i matrix_a(i) * matrix_b(i,j) */

    dp = matrix_a->info[0].mult;
    dq = matrix_b->info[0].mult;

    for(j=0; j<ext_b2; j++) {
	/* Push two zero accumulators: st1 = component 0, st0 = component 1. */
	asm("fldz\n"
	    "fldz\n");

	p = matrix_a->offset +
	    matrix_a->info[0].mult * matrix_a->info[0].lbound;

	q = matrix_b->offset +
	    matrix_b->info[0].mult * matrix_b->info[0].lbound +
	    matrix_b->info[1].mult*(j + matrix_b->info[1].lbound);

	for(i=0; i<ext_a; i++) {
	    asm(LOAD_B);
	    asm(LOAD_A);
      
	    /* Stack: st0=a1, st1=a0, st2=b1, st3=b0, st4=acc1, st5=acc0.
	     * acc0 += a0*b0 - a1*b1;  acc1 += a1*b0 + a0*b1.
	     * All four operands are popped; leaves st0=acc1, st1=acc0. */
	    asm("fld %st(3)\n"
		"fmul %st(2)\n"
		"fadd %st(6)\n"
		"fstp %st(6)\n"
		"fld %st\n"
		"fmul %st(3)\n"
		"fsubr %st(6)\n"
		"fstp %st(6)\n"
		"fmul %st(3)\n"
		"fadd %st(4)\n"
		"fstp %st(4)\n"
		"fmulp %st(1)\n"
		"fadd %st(2)\n"
		"fstp %st(2)\n"
		"fstp %st\n");

	    p += dp;
	    q += dq;
	}

	product = r->base + 2*REAL10_SIZE*j;
	asm(RESULT);
    }

    return r;
}


/* SUBROUTINE21 -- rank-2 complex matrix_a times rank-1 complex matrix_b.
 *
 * Computes r(i) = \sum_j matrix_a(i,j) * matrix_b(j).  Returns the
 * descriptor obtained from temp_array() with ext_a1 elements of
 * matrix_a->element_size bytes; runtime_error(matrix_mismatch) is
 * called when the second extent of matrix_a differs from the extent of
 * matrix_b.
 *
 * In the loops below the C variable i walks the summed dimension and j
 * the result index.  LOAD_RESULT, LOAD_B, LOAD_A and RESULT are defined
 * by the including file and share x87 stack state, so the asm
 * statements must stay in order.  Presumably each load pushes the
 * lower-offset component first, as the COMPLEX 1/2 macros do -- the
 * stack comments below rely on that.
 *
 * NOTE(review): the result is sized from matrix_a only; confirm that
 * matches the width RESULT stores when A and B differ in kind. */
g95_array_descriptor *SUBROUTINE21(g95_array_descriptor *matrix_a,
				   g95_array_descriptor *matrix_b) {
G95_AINT i, j, ext_a1, ext_a2, ext_b, dp, dq;
g95_array_descriptor *r;
char *p, *q, *product;
int temp;

    /* Each extent is ubound - lbound + 1, clamped at zero. */
    ext_a1 = matrix_a->info[0].ubound - matrix_a->info[0].lbound + 1;
    if (ext_a1 < 0)
	ext_a1 = 0;

    ext_a2 = matrix_a->info[1].ubound - matrix_a->info[1].lbound + 1;
    if (ext_a2 < 0)
	ext_a2 = 0;

    ext_b = matrix_b->info[0].ubound - matrix_b->info[0].lbound + 1;
    if (ext_b < 0)
	ext_b = 0;

    if (ext_a2 != ext_b)
	runtime_error(matrix_mismatch);

    r = temp_array(1, matrix_a->element_size, ext_a1);
    temp = 0;

    /* r(i) = \sum_j matrix_a(i,j) * matrix_b(j) */

    /* Sum things such that we traverse matrix A in ascending memory
     * locations, assuming it is contiguous.  This improves performance. */

    /* The result is accumulated in memory, so start from all-zero bytes. */
    memset(r->base, '\0', r->element_size*ext_a1);

    dp = matrix_a->info[0].mult;
    dq = matrix_b->info[0].mult;

    q = matrix_b->offset + matrix_b->info[0].mult*matrix_b->info[0].lbound;

    for(i=0; i<ext_a2; i++) {
	p = matrix_a->offset +
	    matrix_a->info[0].mult*matrix_a->info[0].lbound +
	    matrix_a->info[1].mult * (i+matrix_a->info[1].lbound);

	for(j=0; j<ext_a1; j++) {
	    product = r->base + j*r->element_size;

	    asm(LOAD_RESULT);

	    asm(LOAD_B);
	    asm(LOAD_A);

	    /* Stack: st0=a1, st1=a0, st2=b1, st3=b0, st4=r1, st5=r0.
	     * r0 += a0*b0 - a1*b1;  r1 += a1*b0 + a0*b1.
	     * All four operands are popped; leaves st0=r1, st1=r0. */
	    asm("fld %st(3)\n"
		"fmul %st(2)\n"
		"fadd %st(6)\n"
		"fstp %st(6)\n"
		"fld %st\n"
		"fmul %st(3)\n"
		"fsubr %st(6)\n"
		"fstp %st(6)\n"
		"fmul %st(3)\n"
		"fadd %st(4)\n"
		"fstp %st(4)\n"
		"fmulp %st(1)\n"
		"fadd %st(2)\n"
		"fstp %st(2)\n"
		"fstp %st\n");

	    asm(RESULT);

	    p += dp;
	}

	q += dq;
    }

    return r;
}


/* SUBROUTINE22 -- rank-2 complex matrix_a times rank-2 complex matrix_b.
 *
 * Computes r(i,j) = \sum_k matrix_a(i,k) * matrix_b(k,j).  Returns the
 * descriptor obtained from temp_array(), ext_a1 by ext_b2 elements of
 * matrix_a->element_size bytes; runtime_error(matrix_mismatch) is
 * called when the second extent of matrix_a differs from the first
 * extent of matrix_b.
 *
 * LOAD_RESULT, LOAD_B, LOAD_A and RESULT are defined by the including
 * file and share x87 stack state, so the asm statements must stay in
 * order.  Presumably each load pushes the lower-offset component first,
 * as the COMPLEX 1/2 macros do -- the stack comments below rely on
 * that.
 *
 * NOTE(review): the result is sized from matrix_a only; confirm that
 * matches the width RESULT stores when A and B differ in kind. */
g95_array_descriptor *SUBROUTINE22(g95_array_descriptor *matrix_a,
				   g95_array_descriptor *matrix_b) {
G95_AINT i, j, k, ext_a1, ext_a2, ext_b1, ext_b2, dp;
g95_array_descriptor *r;
char *p, *q, *product;
int temp;

    /* Each extent is ubound - lbound + 1, clamped at zero. */
    ext_a1 = matrix_a->info[0].ubound - matrix_a->info[0].lbound + 1;
    if (ext_a1 < 0)
	ext_a1 = 0;

    ext_a2 = matrix_a->info[1].ubound - matrix_a->info[1].lbound + 1;
    if (ext_a2 < 0)
	ext_a2 = 0;

    ext_b1 = matrix_b->info[0].ubound - matrix_b->info[0].lbound + 1;
    if (ext_b1 < 0)
	ext_b1 = 0;

    ext_b2 = matrix_b->info[1].ubound - matrix_b->info[1].lbound + 1;
    if (ext_b2 < 0)
	ext_b2 = 0;

    if (ext_a2 != ext_b1)
	runtime_error(matrix_mismatch);

    r = temp_array(2, matrix_a->element_size, ext_a1, ext_b2);
    temp = 0;

    /* r(i,j) = \sum_k matrix_a(i,k) * matrix_b(k,j) */

    /* The result is accumulated in memory, so start from all-zero bytes. */
    memset(r->base, '\0', matrix_a->element_size * ext_a1 * ext_b2);

    dp = matrix_a->info[0].mult;

    for(j=0; j<ext_b2; j++) {
	for(k=0; k<ext_b1; k++) {
	    /* product -> first element of result column j. */
	    product = r->offset +
		r->info[0].mult*r->info[0].lbound +
		r->info[1].mult*(r->info[1].lbound + j);

	    /* q -> matrix_b(k,j); it stays fixed over the inner loop. */
	    q = matrix_b->offset +
		matrix_b->info[0].mult * (k + matrix_b->info[0].lbound) +
		matrix_b->info[1].mult * (j + matrix_b->info[1].lbound);

	    p = matrix_a->offset +
		matrix_a->info[0].mult * matrix_a->info[0].lbound +
		matrix_a->info[1].mult * (k + matrix_a->info[1].lbound);

	    for(i=0; i<ext_a1; i++) {
		asm(LOAD_RESULT);

		asm(LOAD_B);
		asm(LOAD_A);

		/* Stack: st0=a1, st1=a0, st2=b1, st3=b0, st4=r1, st5=r0.
		 * r0 += a0*b0 - a1*b1;  r1 += a1*b0 + a0*b1.
		 * All four operands are popped; leaves st0=r1, st1=r0. */
		asm("fld %st(3)\n"
		    "fmul %st(2)\n"
		    "fadd %st(6)\n"
		    "fstp %st(6)\n"
		    "fld %st\n"
		    "fmul %st(3)\n"
		    "fsubr %st(6)\n"
		    "fstp %st(6)\n"
		    "fmul %st(3)\n"
		    "fadd %st(4)\n"
		    "fstp %st(4)\n"
		    "fmulp %st(1)\n"
		    "fadd %st(2)\n"
		    "fstp %st(2)\n"
		    "fstp %st\n");

		asm(RESULT);

		p += dp;

		/* Result elements are stepped by the element size. */
		product += matrix_a->element_size;
	    }
	}
    }

    return r;
}





#endif

/* Drop the per-instantiation macros so the including part of this file
 * can define the next set before including the template again.
 * COMPLEX is deliberately left defined; the includer changes it
 * explicitly. */
#undef SUBROUTINE12
#undef SUBROUTINE21
#undef SUBROUTINE22
#undef LOAD_A
#undef LOAD_B
#undef RESULT
#undef LOAD_RESULT

#else

#include <string.h>
#include "runtime.h"

#if HAVE_REAL_10
/* Message passed to runtime_error() by every instantiation below. */
static char matrix_mismatch[] = "Matrix size mismatch in MATMUL()";

/* COMPLEX selects which branch of the template is compiled; 0 is the
 * all-real branch. */
#define COMPLEX    0

/* A elements: 80-bit extended real (fldt).  B elements: one-byte
 * signed integers, sign-extended into the int local `temp' and loaded
 * from there.  The integer load names its 32-bit operand size
 * explicitly (fildl) instead of relying on the assembler's default for
 * a suffix-less fild. */
#define SUBROUTINE12     prefix(matmul12_r10i1)
#define SUBROUTINE21     prefix(matmul21_r10i1)
#define SUBROUTINE22     prefix(matmul22_r10i1)

#define LOAD_A   "mov %0, %" EAX "\n" \
                 "fldt ( %" EAX ")\n" : : "m" (p) : EAX

#define LOAD_B   "mov %0, %" EAX "\n" \
                 "movsbl (%" EAX "), %%eax\n" \
                 "mov %%eax, %1\n" \
                 "fildl %1\n" : : "m" (q), "m" (temp) : EAX

#include "matmul10.c"

/* A elements: 80-bit extended real (fldt).  B elements: two-byte
 * signed integers, sign-extended into the int local `temp' and loaded
 * from there.  The integer load names its 32-bit operand size
 * explicitly (fildl) instead of relying on the assembler's default for
 * a suffix-less fild. */
#define SUBROUTINE12     prefix(matmul12_r10i2)
#define SUBROUTINE21     prefix(matmul21_r10i2)
#define SUBROUTINE22     prefix(matmul22_r10i2)

#define LOAD_A   "mov %0, %" EAX "\n" \
                 "fldt ( %" EAX ")\n" : : "m" (p) : EAX

#define LOAD_B   "mov %0, %" EAX "\n" \
                 "movswl (%" EAX "), %%eax\n" \
                 "mov %%eax, %1\n" \
                 "fildl %1\n" : : "m" (q), "m" (temp) : EAX

#include "matmul10.c"

/* A elements: 80-bit extended real (fldt).  B elements: four-byte
 * integers loaded directly from the array.  The load is spelled fildl
 * so the assembler reads all 32 bits; a suffix-less fild on a memory
 * operand leaves the operand size to the assembler's default. */
#define SUBROUTINE12     prefix(matmul12_r10i4)
#define SUBROUTINE21     prefix(matmul21_r10i4)
#define SUBROUTINE22     prefix(matmul22_r10i4)

#define LOAD_A   "mov %0, %" EAX "\n" \
                 "fldt ( %" EAX ")\n" : : "m" (p) : EAX

#define LOAD_B   "mov %0, %" EAX "\n" \
                 "fildl ( %" EAX ")\n" : : "m" (q) : EAX

#include "matmul10.c"

/* A elements: 80-bit extended real (fldt).  B elements: 64-bit
 * integers (fildll).  The "m" (temp) operand of LOAD_B is not
 * referenced by its template. */
#define SUBROUTINE12     prefix(matmul12_r10i8)
#define SUBROUTINE21     prefix(matmul21_r10i8)
#define SUBROUTINE22     prefix(matmul22_r10i8)

#define LOAD_A   "mov %0, %" EAX "\n" \
                 "fldt ( %" EAX ")\n" : : "m" (p) : EAX

#define LOAD_B   "mov %0, %" EAX "\n" \
                 "fildll ( %" EAX ")\n" : : "m" (q), "m" (temp) : EAX

#include "matmul10.c"

/* A elements: 80-bit extended real (fldt).  B elements: 32-bit
 * single-precision reals (flds).  The "m" (temp) operand of LOAD_B is
 * not referenced by its template. */
#define SUBROUTINE12     prefix(matmul12_r10r4)
#define SUBROUTINE21     prefix(matmul21_r10r4)
#define SUBROUTINE22     prefix(matmul22_r10r4)

#define LOAD_A   "mov %0, %" EAX "\n" \
                 "fldt ( %" EAX ")\n" : : "m" (p) : EAX

#define LOAD_B   "mov %0, %" EAX "\n" \
                 "flds ( %" EAX ")\n" : : "m" (q), "m" (temp) : EAX

#include "matmul10.c"

/* A elements: 80-bit extended real (fldt).  B elements: 64-bit
 * double-precision reals (fldl).  The "m" (temp) operand of LOAD_B is
 * not referenced by its template. */
#define SUBROUTINE12     prefix(matmul12_r10r8)
#define SUBROUTINE21     prefix(matmul21_r10r8)
#define SUBROUTINE22     prefix(matmul22_r10r8)

#define LOAD_A   "mov %0, %" EAX "\n" \
                 "fldt ( %" EAX ")\n" : : "m" (p) : EAX

#define LOAD_B   "mov %0, %" EAX "\n" \
                 "fldl ( %" EAX ")\n" : : "m" (q), "m" (temp) : EAX

#include "matmul10.c"

/* A elements: one-byte signed integers, sign-extended into the int
 * local `temp' and loaded from there.  B elements: 80-bit extended
 * real (fldt).  The integer load names its 32-bit operand size
 * explicitly (fildl) instead of relying on the assembler's default for
 * a suffix-less fild. */
#define SUBROUTINE12     prefix(matmul12_i1r10)
#define SUBROUTINE21     prefix(matmul21_i1r10)
#define SUBROUTINE22     prefix(matmul22_i1r10)

#define LOAD_A   "mov %0, %" EAX "\n" \
                 "movsbl (%" EAX "), %%eax\n" \
                 "mov %%eax, %1\n" \
                 "fildl %1\n" : : "m" (p), "m" (temp) : EAX

#define LOAD_B   "mov %0, %" EAX "\n" \
                 "fldt ( %" EAX ")\n" : : "m" (q) : EAX


#include "matmul10.c"

/* A elements: two-byte signed integers, sign-extended into the int
 * local `temp' and loaded from there.  B elements: 80-bit extended
 * real (fldt).  The integer load names its 32-bit operand size
 * explicitly (fildl) instead of relying on the assembler's default for
 * a suffix-less fild. */
#define SUBROUTINE12     prefix(matmul12_i2r10)
#define SUBROUTINE21     prefix(matmul21_i2r10)
#define SUBROUTINE22     prefix(matmul22_i2r10)

#define LOAD_A   "mov %0, %" EAX "\n" \
                 "movswl (%" EAX "), %%eax\n" \
                 "mov %%eax, %1\n" \
                 "fildl %1\n" : : "m" (p), "m" (temp) : EAX

#define LOAD_B   "mov %0, %" EAX "\n" \
                 "fldt ( %" EAX ")\n" : : "m" (q) : EAX

#include "matmul10.c"

/* A elements: four-byte integers loaded directly from the array.
 * B elements: 80-bit extended real (fldt).  The integer load is
 * spelled fildl so the assembler reads all 32 bits; a suffix-less fild
 * on a memory operand leaves the operand size to the assembler's
 * default.  The "m" (temp) operand of LOAD_A is not referenced by its
 * template. */
#define SUBROUTINE12     prefix(matmul12_i4r10)
#define SUBROUTINE21     prefix(matmul21_i4r10)
#define SUBROUTINE22     prefix(matmul22_i4r10)

#define LOAD_A   "mov %0, %" EAX "\n" \
                 "fildl ( %" EAX ")\n" : : "m" (p), "m" (temp) : EAX

#define LOAD_B   "mov %0, %" EAX "\n" \
                 "fldt ( %" EAX ")\n" : : "m" (q) : EAX

#include "matmul10.c"

/* A elements: 64-bit integers (fildll).  B elements: 80-bit extended
 * real (fldt).  The "m" (temp) operand of LOAD_A is not referenced by
 * its template. */
#define SUBROUTINE12     prefix(matmul12_i8r10)
#define SUBROUTINE21     prefix(matmul21_i8r10)
#define SUBROUTINE22     prefix(matmul22_i8r10)

#define LOAD_A   "mov %0, %" EAX "\n" \
                 "fildll ( %" EAX ")\n" : : "m" (p), "m" (temp) : EAX

#define LOAD_B   "mov %0, %" EAX "\n" \
                 "fldt ( %" EAX ")\n" : : "m" (q) : EAX

#include "matmul10.c"

/* A elements: 32-bit single-precision reals (flds).  B elements:
 * 80-bit extended real (fldt).  The "m" (temp) operand of LOAD_A is
 * not referenced by its template. */
#define SUBROUTINE12     prefix(matmul12_r4r10)
#define SUBROUTINE21     prefix(matmul21_r4r10)
#define SUBROUTINE22     prefix(matmul22_r4r10)

#define LOAD_A   "mov %0, %" EAX "\n" \
                 "flds ( %" EAX ")\n" : : "m" (p), "m" (temp) : EAX

#define LOAD_B   "mov %0, %" EAX "\n" \
                 "fldt ( %" EAX ")\n" : : "m" (q) : EAX

#include "matmul10.c"

/* A elements: 64-bit double-precision reals (fldl).  B elements:
 * 80-bit extended real (fldt).  The "m" (temp) operand of LOAD_A is
 * not referenced by its template. */
#define SUBROUTINE12     prefix(matmul12_r8r10)
#define SUBROUTINE21     prefix(matmul21_r8r10)
#define SUBROUTINE22     prefix(matmul22_r8r10)

#define LOAD_A   "mov %0, %" EAX "\n" \
                 "fldl ( %" EAX ")\n" : : "m" (p), "m" (temp) : EAX

#define LOAD_B   "mov %0, %" EAX "\n" \
                 "fldt ( %" EAX ")\n" : : "m" (q) : EAX

#include "matmul10.c"

/* A and B elements: both 80-bit extended real (fldt).  The "m" (temp)
 * operand of LOAD_B is not referenced by its template. */
#define SUBROUTINE12     prefix(matmul12_r10r10)
#define SUBROUTINE21     prefix(matmul21_r10r10)
#define SUBROUTINE22     prefix(matmul22_r10r10)

#define LOAD_A   "mov %0, %" EAX "\n" \
                 "fldt ( %" EAX ")\n" : : "m" (p) : EAX

#define LOAD_B   "mov %0, %" EAX "\n" \
                 "fldt ( %" EAX ")\n" : : "m" (q), "m" (temp) : EAX

#include "matmul10.c"

/* Switch the template to its complex-A / real-B branch. */
#undef COMPLEX
#define COMPLEX 1

/* A elements: pairs of 32-bit reals (flds at offsets 0 and 4).
 * B elements: 80-bit extended real (fldt).  Result elements are read
 * and written as pairs of 32-bit reals.
 *
 * RESULT swaps the two stack entries so the offset-0 component is
 * stored first.  It overwrites EAX, so EAX is listed as a clobber like
 * in the other macros. */
#define SUBROUTINE12    prefix(matmul12_z4r10)
#define SUBROUTINE21    prefix(matmul21_z4r10)
#define SUBROUTINE22    prefix(matmul22_z4r10)

#define LOAD_A   "mov %0, %" EAX "\n" \
                 "flds ( %" EAX ")\n" \
                 "flds 4( %" EAX ")\n" : : "m" (p) : EAX

#define LOAD_B   "mov %0, %" EAX "\n" \
                 "fldt (%" EAX ")\n" : : "m" (q) : EAX

#define LOAD_RESULT "mov %0, %" EAX "\n" \
                    "flds (%" EAX ")\n" \
                    "flds 4(%" EAX ")\n" : : "m" (product) : EAX

#define RESULT   "mov %0, %" EAX "\n" \
                 "fxch %%st(1)\n" \
                 "fstps (%" EAX ")\n" \
                 "fstps 4(%" EAX ")\n" : : "m" (product) : EAX

#include "matmul10.c"

/* A elements: pairs of 64-bit reals (fldl at offsets 0 and 8).
 * B elements: 80-bit extended real (fldt).  Result elements are read
 * and written as pairs of 64-bit reals.
 *
 * RESULT swaps the two stack entries so the offset-0 component is
 * stored first.  It overwrites EAX, so EAX is listed as a clobber like
 * in the other macros. */
#define SUBROUTINE12    prefix(matmul12_z8r10)
#define SUBROUTINE21    prefix(matmul21_z8r10)
#define SUBROUTINE22    prefix(matmul22_z8r10)

#define LOAD_A   "mov %0, %" EAX "\n" \
                 "fldl ( %" EAX ")\n" \
                 "fldl 8( %" EAX ")\n" : : "m" (p) : EAX

#define LOAD_B   "mov %0, %" EAX "\n" \
                 "fldt ( %" EAX ")\n" : : "m" (q) : EAX

#define LOAD_RESULT "mov %0, %" EAX "\n" \
                    "fldl (%" EAX ")\n" \
                    "fldl 8(%" EAX ")\n" : : "m" (product) : EAX

#define RESULT   "mov %0, %" EAX "\n" \
                 "fxch %%st(1)\n" \
                 "fstpl (%" EAX ")\n" \
                 "fstpl 8(%" EAX ")\n" : : "m" (product) : EAX

#include "matmul10.c"

/* A elements: pairs of 80-bit extended reals (fldt at offsets 0 and
 * REAL10_SIZE).  B elements: one-byte signed integers, sign-extended
 * into the int local `temp' and loaded from there with an explicit
 * 32-bit fildl.  Result elements are pairs of 80-bit extended reals.
 *
 * RESULT swaps the two stack entries so the offset-0 component is
 * stored first.  It overwrites EAX, so EAX is listed as a clobber like
 * in the other macros. */
#define SUBROUTINE12    prefix(matmul12_z10i1)
#define SUBROUTINE21    prefix(matmul21_z10i1)
#define SUBROUTINE22    prefix(matmul22_z10i1)

#define LOAD_A   "mov %0, %" EAX "\n" \
                 "fldt ( %" EAX ")\n" \
                 "fldt " stringize(REAL10_SIZE) "( %" EAX ")\n" \
                  : : "m" (p) : EAX

#define LOAD_B   "mov %0, %" EAX "\n" \
                 "movsbl (%" EAX "), %%eax\n" \
                 "mov %%eax, %1\n" \
                 "fildl %1\n" : : "m" (q), "m" (temp) : EAX

#define LOAD_RESULT "mov %0, %" EAX "\n" \
                    "fldt (%" EAX ")\n" \
                    "fldt " stringize(REAL10_SIZE) "(%" EAX ")\n" \
                     : : "m" (product) : EAX

#define RESULT   "mov %0, %" EAX "\n" \
                 "fxch %%st(1)\n" \
                 "fstpt (%" EAX ")\n" \
                 "fstpt " stringize(REAL10_SIZE) "(%" EAX ")\n" \
                 : : "m" (product) : EAX

#include "matmul10.c"

/* A elements: pairs of 80-bit extended reals (fldt at offsets 0 and
 * REAL10_SIZE).  B elements: two-byte signed integers, sign-extended
 * into the int local `temp' and loaded from there with an explicit
 * 32-bit fildl.  Result elements are pairs of 80-bit extended reals.
 *
 * RESULT swaps the two stack entries so the offset-0 component is
 * stored first.  It overwrites EAX, so EAX is listed as a clobber like
 * in the other macros. */
#define SUBROUTINE12    prefix(matmul12_z10i2)
#define SUBROUTINE21    prefix(matmul21_z10i2)
#define SUBROUTINE22    prefix(matmul22_z10i2)

#define LOAD_A   "mov %0, %" EAX "\n" \
                 "fldt ( %" EAX ")\n" \
                 "fldt " stringize(REAL10_SIZE) "( %" EAX ")\n" \
                  : : "m" (p) : EAX

#define LOAD_B   "mov %0, %" EAX "\n" \
                 "movswl (%" EAX "), %%eax\n" \
                 "mov %%eax, %1\n" \
                 "fildl %1\n" : : "m" (q), "m" (temp) : EAX

#define LOAD_RESULT "mov %0, %" EAX "\n" \
                    "fldt (%" EAX ")\n" \
                    "fldt " stringize(REAL10_SIZE) "(%" EAX ")\n" \
                     : : "m" (product) : EAX

#define RESULT   "mov %0, %" EAX "\n" \
                 "fxch %%st(1)\n" \
                 "fstpt (%" EAX ")\n" \
                 "fstpt " stringize(REAL10_SIZE) "(%" EAX ")\n" \
                 : : "m" (product) : EAX

#include "matmul10.c"


/* A elements: pairs of 80-bit extended reals (fldt at offsets 0 and
 * REAL10_SIZE).  B elements: four-byte integers loaded directly from
 * the array; the load is spelled fildl so the assembler reads all 32
 * bits rather than applying its default to a suffix-less fild.
 * Result elements are pairs of 80-bit extended reals.
 *
 * RESULT swaps the two stack entries so the offset-0 component is
 * stored first.  It overwrites EAX, so EAX is listed as a clobber like
 * in the other macros. */
#define SUBROUTINE12    prefix(matmul12_z10i4)
#define SUBROUTINE21    prefix(matmul21_z10i4)
#define SUBROUTINE22    prefix(matmul22_z10i4)

#define LOAD_A   "mov %0, %" EAX "\n" \
                 "fldt ( %" EAX ")\n" \
                 "fldt " stringize(REAL10_SIZE) "( %" EAX ")\n" \
                  : : "m" (p) : EAX

#define LOAD_B   "mov %0, %" EAX "\n" \
                 "fildl (%" EAX ")\n" : : "m" (q) : EAX

#define LOAD_RESULT "mov %0, %" EAX "\n" \
                    "fldt (%" EAX ")\n" \
                    "fldt " stringize(REAL10_SIZE) "(%" EAX ")\n" \
                     : : "m" (product) : EAX

#define RESULT   "mov %0, %" EAX "\n" \
                 "fxch %%st(1)\n" \
                 "fstpt (%" EAX ")\n" \
                 "fstpt " stringize(REAL10_SIZE) "(%" EAX ")\n" \
                 : : "m" (product) : EAX

#include "matmul10.c"


/* A elements: pairs of 80-bit extended reals (fldt at offsets 0 and
 * REAL10_SIZE).  B elements: 64-bit integers (fildll).  Result
 * elements are pairs of 80-bit extended reals.
 *
 * RESULT swaps the two stack entries so the offset-0 component is
 * stored first.  It overwrites EAX, so EAX is listed as a clobber like
 * in the other macros. */
#define SUBROUTINE12    prefix(matmul12_z10i8)
#define SUBROUTINE21    prefix(matmul21_z10i8)
#define SUBROUTINE22    prefix(matmul22_z10i8)

#define LOAD_A   "mov %0, %" EAX "\n" \
                 "fldt ( %" EAX ")\n" \
                 "fldt " stringize(REAL10_SIZE) "( %" EAX ")\n" \
                  : : "m" (p) : EAX

#define LOAD_B   "mov %0, %" EAX "\n" \
                 "fildll (%" EAX ")\n" : : "m" (q) : EAX

#define LOAD_RESULT "mov %0, %" EAX "\n" \
                    "fldt (%" EAX ")\n" \
                    "fldt " stringize(REAL10_SIZE) "(%" EAX ")\n" \
                     : : "m" (product) : EAX

#define RESULT   "mov %0, %" EAX "\n" \
                 "fxch %%st(1)\n" \
                 "fstpt (%" EAX ")\n" \
                 "fstpt " stringize(REAL10_SIZE) "(%" EAX ")\n" \
                 : : "m" (product) : EAX

#include "matmul10.c"


/* A elements: pairs of 80-bit extended reals (fldt at offsets 0 and
 * REAL10_SIZE).  B elements: 32-bit single-precision reals (flds).
 * Result elements are pairs of 80-bit extended reals.
 *
 * RESULT swaps the two stack entries so the offset-0 component is
 * stored first.  It overwrites EAX, so EAX is listed as a clobber like
 * in the other macros. */
#define SUBROUTINE12    prefix(matmul12_z10r4)
#define SUBROUTINE21    prefix(matmul21_z10r4)
#define SUBROUTINE22    prefix(matmul22_z10r4)

#define LOAD_A   "mov %0, %" EAX "\n" \
                 "fldt ( %" EAX ")\n" \
                 "fldt " stringize(REAL10_SIZE) "( %" EAX ")\n" \
                  : : "m" (p) : EAX

#define LOAD_B   "mov %0, %" EAX "\n" \
                 "flds (%" EAX ")\n" : : "m" (q) : EAX

#define LOAD_RESULT "mov %0, %" EAX "\n" \
                    "fldt (%" EAX ")\n" \
                    "fldt " stringize(REAL10_SIZE) "(%" EAX ")\n" \
                     : : "m" (product) : EAX

#define RESULT   "mov %0, %" EAX "\n" \
                 "fxch %%st(1)\n" \
                 "fstpt (%" EAX ")\n" \
                 "fstpt " stringize(REAL10_SIZE) "(%" EAX ")\n" \
                 : : "m" (product) : EAX

#include "matmul10.c"


/* A elements: pairs of 80-bit extended reals (fldt at offsets 0 and
 * REAL10_SIZE).  B elements: 64-bit double-precision reals (fldl).
 * Result elements are pairs of 80-bit extended reals.
 *
 * RESULT swaps the two stack entries so the offset-0 component is
 * stored first.  It overwrites EAX, so EAX is listed as a clobber like
 * in the other macros. */
#define SUBROUTINE12    prefix(matmul12_z10r8)
#define SUBROUTINE21    prefix(matmul21_z10r8)
#define SUBROUTINE22    prefix(matmul22_z10r8)

#define LOAD_A   "mov %0, %" EAX "\n" \
                 "fldt ( %" EAX ")\n" \
                 "fldt " stringize(REAL10_SIZE) "( %" EAX ")\n" \
                  : : "m" (p) : EAX

#define LOAD_B   "mov %0, %" EAX "\n" \
                 "fldl (%" EAX ")\n" : : "m" (q) : EAX

#define LOAD_RESULT "mov %0, %" EAX "\n" \
                    "fldt (%" EAX ")\n" \
                    "fldt " stringize(REAL10_SIZE) "(%" EAX ")\n" \
                     : : "m" (product) : EAX

#define RESULT   "mov %0, %" EAX "\n" \
                 "fxch %%st(1)\n" \
                 "fstpt (%" EAX ")\n" \
                 "fstpt " stringize(REAL10_SIZE) "(%" EAX ")\n" \
                 : : "m" (product) : EAX

#include "matmul10.c"


/* A elements: pairs of 80-bit extended reals (fldt at offsets 0 and
 * REAL10_SIZE).  B elements: 80-bit extended reals (fldt).  Result
 * elements are pairs of 80-bit extended reals.
 *
 * RESULT swaps the two stack entries so the offset-0 component is
 * stored first.  It overwrites EAX, so EAX is listed as a clobber like
 * in the other macros. */
#define SUBROUTINE12    prefix(matmul12_z10r10)
#define SUBROUTINE21    prefix(matmul21_z10r10)
#define SUBROUTINE22    prefix(matmul22_z10r10)

#define LOAD_A   "mov %0, %" EAX "\n" \
                 "fldt ( %" EAX ")\n" \
                 "fldt " stringize(REAL10_SIZE) "( %" EAX ")\n" \
                  : : "m" (p) : EAX

#define LOAD_B   "mov %0, %" EAX "\n" \
                 "fldt ( %" EAX ")\n" : : "m" (q) : EAX

#define LOAD_RESULT "mov %0, %" EAX "\n" \
                    "fldt (%" EAX ")\n" \
                    "fldt " stringize(REAL10_SIZE) "(%" EAX ")\n" \
                     : : "m" (product) : EAX

#define RESULT   "mov %0, %" EAX "\n" \
                 "fxch %%st(1)\n" \
                 "fstpt (%" EAX ")\n" \
                 "fstpt " stringize(REAL10_SIZE) "(%" EAX ")\n" \
                 : : "m" (product) : EAX

#include "matmul10.c"


/* Switch the template to COMPLEX mode 2 for the groups that follow.
 * NOTE(review): judging by the r.../i... x z... names used below, this is
 * the "real or integer matrix_a, complex matrix_b" mode; the template code
 * for this mode is not visible from here, so confirm against it. */
#undef COMPLEX
#define COMPLEX 2

/* Function names for the r10z4 instantiation of the template; judging by
 * the suffix, matrix_a is real(10) and matrix_b is complex(4). */
#define SUBROUTINE12    prefix(matmul12_r10z4)
#define SUBROUTINE21    prefix(matmul21_r10z4)
#define SUBROUTINE22    prefix(matmul22_r10z4)

/* Push the 80-bit extended value that p points at. */
#define LOAD_A   "mov %0, %" EAX "\n" \
                 "fldt ( %" EAX ")\n" : : "m" (p) : EAX

/* Push one matrix_b element addressed by q: the 32-bit float at offset 0
 * first, then the one at offset 4. */
#define LOAD_B   "mov %0, %" EAX "\n" \
                 "flds ( %" EAX ")\n" \
                 "flds 4( %" EAX ")\n" : : "m" (q) : EAX

/* Push the two 80-bit values stored at product and at
 * product + REAL10_SIZE, in that order.  The result of a real(10) by
 * complex(4) product is complex(10), so the accumulator is accessed as a
 * pair of 80-bit values exactly like every other group expanded through
 * this template, not as a pair of 4-byte floats. */
#define LOAD_RESULT "mov %0, %" EAX "\n" \
                    "fldt (%" EAX ")\n" \
                    "fldt " stringize(REAL10_SIZE) "(%" EAX ")\n" \
                     : : "m" (product) : EAX

/* Swap st(0)/st(1), then pop both as 80-bit values to product and
 * product + REAL10_SIZE (reverse of LOAD_RESULT's push order).
 * NOTE(review): no EAX clobber is listed here, unlike the LOAD_* macros;
 * left as is because the template's asm() statement is not visible. */
#define RESULT   "mov %0, %" EAX "\n" \
                 "fxch %%st(1)\n" \
                 "fstpt (%" EAX ")\n" \
                 "fstpt " stringize(REAL10_SIZE) "(%" EAX ")\n" \
                 : : "m" (product)

/* Expand the template with the macros defined above. */
#include "matmul10.c"

/* Function names for the r10z8 instantiation of the template; judging by
 * the suffix, matrix_a is real(10) and matrix_b is complex(8). */
#define SUBROUTINE12    prefix(matmul12_r10z8)
#define SUBROUTINE21    prefix(matmul21_r10z8)
#define SUBROUTINE22    prefix(matmul22_r10z8)

/* Push the 80-bit extended value that p points at. */
#define LOAD_A   "mov %0, %" EAX "\n" \
                 "fldt ( %" EAX ")\n" : : "m" (p) : EAX

/* Push one matrix_b element addressed by q: the 64-bit double at offset 0
 * first, then the one at offset 8. */
#define LOAD_B   "mov %0, %" EAX "\n" \
                 "fldl ( %" EAX ")\n" \
                 "fldl 8( %" EAX ")\n" : : "m" (q) : EAX

/* Push the two 80-bit values stored at product and at
 * product + REAL10_SIZE, in that order.  The result of a real(10) by
 * complex(8) product is complex(10), so the accumulator is accessed as a
 * pair of 80-bit values exactly like every other group expanded through
 * this template, not as a pair of 8-byte doubles. */
#define LOAD_RESULT "mov %0, %" EAX "\n" \
                    "fldt (%" EAX ")\n" \
                    "fldt " stringize(REAL10_SIZE) "(%" EAX ")\n" \
                     : : "m" (product) : EAX

/* Swap st(0)/st(1), then pop both as 80-bit values to product and
 * product + REAL10_SIZE (reverse of LOAD_RESULT's push order).
 * NOTE(review): no EAX clobber is listed here, unlike the LOAD_* macros;
 * left as is because the template's asm() statement is not visible. */
#define RESULT   "mov %0, %" EAX "\n" \
                 "fxch %%st(1)\n" \
                 "fstpt (%" EAX ")\n" \
                 "fstpt " stringize(REAL10_SIZE) "(%" EAX ")\n" \
                 : : "m" (product)

/* Expand the template with the macros defined above. */
#include "matmul10.c"

/* Function names for the i1z10 instantiation of the template; judging by
 * the suffix, matrix_a is integer(1) and matrix_b is complex(10). */
#define SUBROUTINE12    prefix(matmul12_i1z10)
#define SUBROUTINE21    prefix(matmul21_i1z10)
#define SUBROUTINE22    prefix(matmul22_i1z10)

/* Sign-extend the byte p points at into %eax (movsbl), spill it to the
 * scratch variable temp and push that onto the x87 stack with fild.
 * NOTE(review): temp is written by the asm but is listed as an input
 * operand, and fild carries no operand-size suffix. */
#define LOAD_A   "mov %0, %" EAX "\n" \
                 "movsbl (%" EAX "), %%eax\n" \
                 "mov %%eax, %1\n" \
                 "fild %1\n" : : "m" (p), "m" (temp) : EAX

/* Push one matrix_b element addressed by q: the 80-bit value at offset 0
 * first, then the one at offset REAL10_SIZE (presumably the real and
 * imaginary parts). */
#define LOAD_B   "mov %0, %" EAX "\n" \
                 "fldt ( %" EAX ")\n" \
                 "fldt " stringize(REAL10_SIZE) "( %" EAX ")\n" \
                  : : "m" (q) : EAX

/* Push the two 80-bit values stored at product and at
 * product + REAL10_SIZE, in that order. */
#define LOAD_RESULT "mov %0, %" EAX "\n" \
                    "fldt (%" EAX ")\n" \
                    "fldt " stringize(REAL10_SIZE) "(%" EAX ")\n" \
                     : : "m" (product) : EAX

/* Swap st(0)/st(1), then pop both as 80-bit values to product and
 * product + REAL10_SIZE (reverse of LOAD_RESULT's push order).
 * NOTE(review): no EAX clobber is listed here, unlike the LOAD_* macros. */
#define RESULT   "mov %0, %" EAX "\n" \
                 "fxch %%st(1)\n" \
                 "fstpt (%" EAX ")\n" \
                 "fstpt " stringize(REAL10_SIZE) "(%" EAX ")\n" \
                 : : "m" (product)

/* Expand the template with the macros defined above. */
#include "matmul10.c"


/* Function names for the i2z10 instantiation of the template; judging by
 * the suffix, matrix_a is integer(2) and matrix_b is complex(10). */
#define SUBROUTINE12    prefix(matmul12_i2z10)
#define SUBROUTINE21    prefix(matmul21_i2z10)
#define SUBROUTINE22    prefix(matmul22_i2z10)

/* Sign-extend the 16-bit word p points at into %eax (movswl), spill it to
 * the scratch variable temp and push that onto the x87 stack with fild.
 * NOTE(review): temp is written by the asm but is listed as an input
 * operand, and fild carries no operand-size suffix. */
#define LOAD_A   "mov %0, %" EAX "\n" \
                 "movswl (%" EAX "), %%eax\n" \
                 "mov %%eax, %1\n" \
                 "fild %1\n" : : "m" (p), "m" (temp) : EAX

/* Push one matrix_b element addressed by q: the 80-bit value at offset 0
 * first, then the one at offset REAL10_SIZE (presumably the real and
 * imaginary parts). */
#define LOAD_B   "mov %0, %" EAX "\n" \
                 "fldt ( %" EAX ")\n" \
                 "fldt " stringize(REAL10_SIZE) "( %" EAX ")\n" \
                  : : "m" (q) : EAX

/* Push the two 80-bit values stored at product and at
 * product + REAL10_SIZE, in that order. */
#define LOAD_RESULT "mov %0, %" EAX "\n" \
                    "fldt (%" EAX ")\n" \
                    "fldt " stringize(REAL10_SIZE) "(%" EAX ")\n" \
                     : : "m" (product) : EAX

/* Swap st(0)/st(1), then pop both as 80-bit values to product and
 * product + REAL10_SIZE (reverse of LOAD_RESULT's push order).
 * NOTE(review): no EAX clobber is listed here, unlike the LOAD_* macros. */
#define RESULT   "mov %0, %" EAX "\n" \
                 "fxch %%st(1)\n" \
                 "fstpt (%" EAX ")\n" \
                 "fstpt " stringize(REAL10_SIZE) "(%" EAX ")\n" \
                 : : "m" (product)

/* Expand the template with the macros defined above. */
#include "matmul10.c"


/* Function names for the i4z10 instantiation of the template; judging by
 * the suffix, matrix_a is integer(4) and matrix_b is complex(10). */
#define SUBROUTINE12    prefix(matmul12_i4z10)
#define SUBROUTINE21    prefix(matmul21_i4z10)
#define SUBROUTINE22    prefix(matmul22_i4z10)

/* Push the 32-bit integer that p points at.  The explicit 'l' suffix pins
 * the operand to 32 bits instead of relying on whatever width the
 * assembler assumes for a suffix-less fild with a memory operand. */
#define LOAD_A   "mov %0, %" EAX "\n" \
                 "fildl (%" EAX ")\n" : : "m" (p) : EAX

/* Push one matrix_b element addressed by q: the 80-bit value at offset 0
 * first, then the one at offset REAL10_SIZE (presumably the real and
 * imaginary parts). */
#define LOAD_B   "mov %0, %" EAX "\n" \
                 "fldt ( %" EAX ")\n" \
                 "fldt " stringize(REAL10_SIZE) "( %" EAX ")\n" \
                  : : "m" (q) : EAX

/* Push the two 80-bit values stored at product and at
 * product + REAL10_SIZE, in that order. */
#define LOAD_RESULT "mov %0, %" EAX "\n" \
                    "fldt (%" EAX ")\n" \
                    "fldt " stringize(REAL10_SIZE) "(%" EAX ")\n" \
                     : : "m" (product) : EAX

/* Swap st(0)/st(1), then pop both as 80-bit values to product and
 * product + REAL10_SIZE (reverse of LOAD_RESULT's push order).
 * NOTE(review): no EAX clobber is listed here, unlike the LOAD_* macros;
 * left as is because the template's asm() statement is not visible. */
#define RESULT   "mov %0, %" EAX "\n" \
                 "fxch %%st(1)\n" \
                 "fstpt (%" EAX ")\n" \
                 "fstpt " stringize(REAL10_SIZE) "(%" EAX ")\n" \
                 : : "m" (product)

/* Expand the template with the macros defined above. */
#include "matmul10.c"


/* Function names for the i8z10 instantiation of the template; judging by
 * the suffix, matrix_a is integer(8) and matrix_b is complex(10). */
#define SUBROUTINE12    prefix(matmul12_i8z10)
#define SUBROUTINE21    prefix(matmul21_i8z10)
#define SUBROUTINE22    prefix(matmul22_i8z10)

/* Push the 64-bit integer that p points at (fildll). */
#define LOAD_A   "mov %0, %" EAX "\n" \
                 "fildll (%" EAX ")\n" : : "m" (p) : EAX

/* Push one matrix_b element addressed by q: the 80-bit value at offset 0
 * first, then the one at offset REAL10_SIZE (presumably the real and
 * imaginary parts). */
#define LOAD_B   "mov %0, %" EAX "\n" \
                 "fldt ( %" EAX ")\n" \
                 "fldt " stringize(REAL10_SIZE) "( %" EAX ")\n" \
                  : : "m" (q) : EAX

/* Push the two 80-bit values stored at product and at
 * product + REAL10_SIZE, in that order. */
#define LOAD_RESULT "mov %0, %" EAX "\n" \
                    "fldt (%" EAX ")\n" \
                    "fldt " stringize(REAL10_SIZE) "(%" EAX ")\n" \
                     : : "m" (product) : EAX

/* Swap st(0)/st(1), then pop both as 80-bit values to product and
 * product + REAL10_SIZE (reverse of LOAD_RESULT's push order).
 * NOTE(review): no EAX clobber is listed here, unlike the LOAD_* macros. */
#define RESULT   "mov %0, %" EAX "\n" \
                 "fxch %%st(1)\n" \
                 "fstpt (%" EAX ")\n" \
                 "fstpt " stringize(REAL10_SIZE) "(%" EAX ")\n" \
                 : : "m" (product)

/* Expand the template with the macros defined above. */
#include "matmul10.c"


/* Function names for the r4z10 instantiation of the template; judging by
 * the suffix, matrix_a is real(4) and matrix_b is complex(10). */
#define SUBROUTINE12    prefix(matmul12_r4z10)
#define SUBROUTINE21    prefix(matmul21_r4z10)
#define SUBROUTINE22    prefix(matmul22_r4z10)

/* Push the 32-bit float that p points at. */
#define LOAD_A   "mov %0, %" EAX "\n" \
                 "flds (%" EAX ")\n" : : "m" (p) : EAX

/* Push one matrix_b element addressed by q: the 80-bit value at offset 0
 * first, then the one at offset REAL10_SIZE (presumably the real and
 * imaginary parts). */
#define LOAD_B   "mov %0, %" EAX "\n" \
                 "fldt ( %" EAX ")\n" \
                 "fldt " stringize(REAL10_SIZE) "( %" EAX ")\n" \
                  : : "m" (q) : EAX

/* Push the two 80-bit values stored at product and at
 * product + REAL10_SIZE, in that order. */
#define LOAD_RESULT "mov %0, %" EAX "\n" \
                    "fldt (%" EAX ")\n" \
                    "fldt " stringize(REAL10_SIZE) "(%" EAX ")\n" \
                     : : "m" (product) : EAX

/* Swap st(0)/st(1), then pop both as 80-bit values to product and
 * product + REAL10_SIZE (reverse of LOAD_RESULT's push order).
 * NOTE(review): no EAX clobber is listed here, unlike the LOAD_* macros. */
#define RESULT   "mov %0, %" EAX "\n" \
                 "fxch %%st(1)\n" \
                 "fstpt (%" EAX ")\n" \
                 "fstpt " stringize(REAL10_SIZE) "(%" EAX ")\n" \
                 : : "m" (product)

/* Expand the template with the macros defined above. */
#include "matmul10.c"


/* Function names for the r8z10 instantiation of the template; judging by
 * the suffix, matrix_a is real(8) and matrix_b is complex(10). */
#define SUBROUTINE12    prefix(matmul12_r8z10)
#define SUBROUTINE21    prefix(matmul21_r8z10)
#define SUBROUTINE22    prefix(matmul22_r8z10)

/* Push the 64-bit double that p points at. */
#define LOAD_A   "mov %0, %" EAX "\n" \
                 "fldl (%" EAX ")\n" : : "m" (p) : EAX

/* Push one matrix_b element addressed by q: the 80-bit value at offset 0
 * first, then the one at offset REAL10_SIZE (presumably the real and
 * imaginary parts). */
#define LOAD_B   "mov %0, %" EAX "\n" \
                 "fldt ( %" EAX ")\n" \
                 "fldt " stringize(REAL10_SIZE) "( %" EAX ")\n" \
                  : : "m" (q) : EAX

/* Push the two 80-bit values stored at product and at
 * product + REAL10_SIZE, in that order. */
#define LOAD_RESULT "mov %0, %" EAX "\n" \
                    "fldt (%" EAX ")\n" \
                    "fldt " stringize(REAL10_SIZE) "(%" EAX ")\n" \
                     : : "m" (product) : EAX

/* Swap st(0)/st(1), then pop both as 80-bit values to product and
 * product + REAL10_SIZE (reverse of LOAD_RESULT's push order).
 * NOTE(review): no EAX clobber is listed here, unlike the LOAD_* macros. */
#define RESULT   "mov %0, %" EAX "\n" \
                 "fxch %%st(1)\n" \
                 "fstpt (%" EAX ")\n" \
                 "fstpt " stringize(REAL10_SIZE) "(%" EAX ")\n" \
                 : : "m" (product)

/* Expand the template with the macros defined above. */
#include "matmul10.c"


/* Function names for the r10z10 instantiation of the template; judging by
 * the suffix, matrix_a is real(10) and matrix_b is complex(10). */
#define SUBROUTINE12    prefix(matmul12_r10z10)
#define SUBROUTINE21    prefix(matmul21_r10z10)
#define SUBROUTINE22    prefix(matmul22_r10z10)

/* Push the 80-bit extended value that p points at. */
#define LOAD_A   "mov %0, %" EAX "\n" \
                 "fldt ( %" EAX ")\n" : : "m" (p) : EAX

/* Push one matrix_b element addressed by q: the 80-bit value at offset 0
 * first, then the one at offset REAL10_SIZE (presumably the real and
 * imaginary parts). */
#define LOAD_B   "mov %0, %" EAX "\n" \
                 "fldt ( %" EAX ")\n" \
                 "fldt " stringize(REAL10_SIZE) "( %" EAX ")\n" \
                  : : "m" (q) : EAX

/* Push the two 80-bit values stored at product and at
 * product + REAL10_SIZE, in that order. */
#define LOAD_RESULT "mov %0, %" EAX "\n" \
                    "fldt (%" EAX ")\n" \
                    "fldt " stringize(REAL10_SIZE) "(%" EAX ")\n" \
                     : : "m" (product) : EAX

/* Swap st(0)/st(1), then pop both as 80-bit values to product and
 * product + REAL10_SIZE (reverse of LOAD_RESULT's push order).
 * NOTE(review): no EAX clobber is listed here, unlike the LOAD_* macros. */
#define RESULT   "mov %0, %" EAX "\n" \
                 "fxch %%st(1)\n" \
                 "fstpt (%" EAX ")\n" \
                 "fstpt " stringize(REAL10_SIZE) "(%" EAX ")\n" \
                 : : "m" (product)

/* Expand the template with the macros defined above. */
#include "matmul10.c"

/* Switch the template to COMPLEX mode 3 for the groups that follow.
 * NOTE(review): judging by the z... x z... names used below, this is the
 * mode where both operands are complex; the template code for this mode
 * is not visible from here, so confirm against it. */
#undef COMPLEX
#define COMPLEX 3

/* Function names for the z4z10 instantiation of the template; judging by
 * the suffix, matrix_a is complex(4) and matrix_b is complex(10). */
#define SUBROUTINE12    prefix(matmul12_z4z10)
#define SUBROUTINE21    prefix(matmul21_z4z10)
#define SUBROUTINE22    prefix(matmul22_z4z10)

/* Push one matrix_a element addressed by p: the 32-bit float at offset 0
 * first, then the one at offset 4. */
#define LOAD_A   "mov %0, %" EAX "\n" \
                 "flds ( %" EAX ")\n" \
                 "flds 4( %" EAX ")\n" : : "m" (p) : EAX

/* Push one matrix_b element addressed by q: the 80-bit value at offset 0
 * first, then the one at offset REAL10_SIZE. */
#define LOAD_B   "mov %0, %" EAX "\n" \
                 "fldt ( %" EAX ")\n" \
                 "fldt " stringize(REAL10_SIZE) "( %" EAX ")\n" \
                  : : "m" (q) : EAX

/* Swap st(0)/st(1), then pop both as 80-bit values to product and
 * product + REAL10_SIZE (reverse of LOAD_RESULT's push order).
 * NOTE(review): no EAX clobber is listed here, unlike the LOAD_* macros. */
#define RESULT   "mov %0, %" EAX "\n" \
                 "fxch %%st(1)\n" \
                 "fstpt (%" EAX ")\n" \
                 "fstpt " stringize(REAL10_SIZE) "(%" EAX ")\n" \
                 : : "m" (product)

/* Push the two 80-bit values stored at product and at
 * product + REAL10_SIZE, in that order. */
#define LOAD_RESULT "mov %0, %" EAX "\n" \
                    "fldt (%" EAX ")\n" \
                    "fldt " stringize(REAL10_SIZE) " (%" EAX ")\n" \
                    : : "m" (product) : EAX

/* Expand the template with the macros defined above. */
#include "matmul10.c"

/* Function names for the z8z10 instantiation of the template; judging by
 * the suffix, matrix_a is complex(8) and matrix_b is complex(10). */
#define SUBROUTINE12    prefix(matmul12_z8z10)
#define SUBROUTINE21    prefix(matmul21_z8z10)
#define SUBROUTINE22    prefix(matmul22_z8z10)

/* Push one matrix_a element addressed by p: the 64-bit double at offset 0
 * first, then the one at offset 8. */
#define LOAD_A   "mov %0, %" EAX "\n" \
                 "fldl ( %" EAX ")\n" \
                 "fldl 8( %" EAX ")\n" : : "m" (p) : EAX

/* Push one matrix_b element addressed by q: the 80-bit value at offset 0
 * first, then the one at offset REAL10_SIZE. */
#define LOAD_B   "mov %0, %" EAX "\n" \
                 "fldt ( %" EAX ")\n" \
                 "fldt " stringize(REAL10_SIZE) "( %" EAX ")\n" \
                  : : "m" (q) : EAX

/* Swap st(0)/st(1), then pop both as 80-bit values to product and
 * product + REAL10_SIZE (reverse of LOAD_RESULT's push order).
 * NOTE(review): no EAX clobber is listed here, unlike the LOAD_* macros. */
#define RESULT   "mov %0, %" EAX "\n" \
                 "fxch %%st(1)\n" \
                 "fstpt (%" EAX ")\n" \
                 "fstpt " stringize(REAL10_SIZE) "(%" EAX ")\n" \
                 : : "m" (product)

/* Push the two 80-bit values stored at product and at
 * product + REAL10_SIZE, in that order. */
#define LOAD_RESULT "mov %0, %" EAX "\n" \
                    "fldt (%" EAX ")\n" \
                    "fldt " stringize(REAL10_SIZE) " (%" EAX ")\n" \
                    : : "m" (product) : EAX

/* Expand the template with the macros defined above. */
#include "matmul10.c"

/* Function names for the z10z4 instantiation of the template; judging by
 * the suffix, matrix_a is complex(10) and matrix_b is complex(4). */
#define SUBROUTINE12    prefix(matmul12_z10z4)
#define SUBROUTINE21    prefix(matmul21_z10z4)
#define SUBROUTINE22    prefix(matmul22_z10z4)

/* Push one matrix_a element addressed by p: the 80-bit value at offset 0
 * first, then the one at offset REAL10_SIZE. */
#define LOAD_A   "mov %0, %" EAX "\n" \
                 "fldt ( %" EAX ")\n" \
                 "fldt " stringize(REAL10_SIZE) "( %" EAX ")\n" \
                  : : "m" (p) : EAX

/* Push one matrix_b element addressed by q: the 32-bit float at offset 0
 * first, then the one at offset 4. */
#define LOAD_B   "mov %0, %" EAX "\n" \
                 "flds ( %" EAX ")\n" \
                 "flds 4( %" EAX ")\n" : : "m" (q) : EAX

/* Swap st(0)/st(1), then pop both as 80-bit values to product and
 * product + REAL10_SIZE (reverse of LOAD_RESULT's push order).
 * NOTE(review): no EAX clobber is listed here, unlike the LOAD_* macros. */
#define RESULT   "mov %0, %" EAX "\n" \
                 "fxch %%st(1)\n" \
                 "fstpt (%" EAX ")\n" \
                 "fstpt " stringize(REAL10_SIZE) "(%" EAX ")\n" \
                 : : "m" (product)

/* Push the two 80-bit values stored at product and at
 * product + REAL10_SIZE, in that order. */
#define LOAD_RESULT "mov %0, %" EAX "\n" \
                    "fldt (%" EAX ")\n" \
                    "fldt " stringize(REAL10_SIZE) " (%" EAX ")\n" \
                    : : "m" (product) : EAX

/* Expand the template with the macros defined above. */
#include "matmul10.c"

/* Function names for the z10z8 instantiation of the template; judging by
 * the suffix, matrix_a is complex(10) and matrix_b is complex(8). */
#define SUBROUTINE12    prefix(matmul12_z10z8)
#define SUBROUTINE21    prefix(matmul21_z10z8)
#define SUBROUTINE22    prefix(matmul22_z10z8)

/* Push one matrix_a element addressed by p: the 80-bit value at offset 0
 * first, then the one at offset REAL10_SIZE. */
#define LOAD_A   "mov %0, %" EAX "\n" \
                 "fldt ( %" EAX ")\n" \
                 "fldt " stringize(REAL10_SIZE) "( %" EAX ")\n" \
                  : : "m" (p) : EAX

/* Push one matrix_b element addressed by q: the 64-bit double at offset 0
 * first, then the one at offset 8.  The components are 8 bytes wide, so
 * they must be loaded with fldl; flds would read only 4 bytes of each and
 * interpret them as a single-precision float. */
#define LOAD_B   "mov %0, %" EAX "\n" \
                 "fldl ( %" EAX ")\n" \
                 "fldl 8( %" EAX ")\n" : : "m" (q) : EAX

/* Swap st(0)/st(1), then pop both as 80-bit values to product and
 * product + REAL10_SIZE (reverse of LOAD_RESULT's push order).
 * NOTE(review): no EAX clobber is listed here, unlike the LOAD_* macros;
 * left as is because the template's asm() statement is not visible. */
#define RESULT   "mov %0, %" EAX "\n" \
                 "fxch %%st(1)\n" \
                 "fstpt (%" EAX ")\n" \
                 "fstpt " stringize(REAL10_SIZE) "(%" EAX ")\n" \
                 : : "m" (product)

/* Push the two 80-bit values stored at product and at
 * product + REAL10_SIZE, in that order. */
#define LOAD_RESULT "mov %0, %" EAX "\n" \
                    "fldt (%" EAX ")\n" \
                    "fldt " stringize(REAL10_SIZE) " (%" EAX ")\n" \
                    : : "m" (product) : EAX

/* Expand the template with the macros defined above. */
#include "matmul10.c"

/* Function names for the z10z10 instantiation of the template; judging by
 * the suffix, both matrix_a and matrix_b are complex(10). */
#define SUBROUTINE12    prefix(matmul12_z10z10)
#define SUBROUTINE21    prefix(matmul21_z10z10)
#define SUBROUTINE22    prefix(matmul22_z10z10)

/* Push one matrix_a element addressed by p: the 80-bit value at offset 0
 * first, then the one at offset REAL10_SIZE. */
#define LOAD_A   "mov %0, %" EAX "\n" \
                 "fldt ( %" EAX ")\n" \
                 "fldt " stringize(REAL10_SIZE) "( %" EAX ")\n" \
                  : : "m" (p) : EAX

/* Push one matrix_b element addressed by q: the 80-bit value at offset 0
 * first, then the one at offset REAL10_SIZE. */
#define LOAD_B   "mov %0, %" EAX "\n" \
                 "fldt ( %" EAX ")\n" \
                 "fldt " stringize(REAL10_SIZE) "( %" EAX ")\n" \
                  : : "m" (q) : EAX

/* Swap st(0)/st(1), then pop both as 80-bit values to product and
 * product + REAL10_SIZE (reverse of LOAD_RESULT's push order).
 * NOTE(review): no EAX clobber is listed here, unlike the LOAD_* macros. */
#define RESULT   "mov %0, %" EAX "\n" \
                 "fxch %%st(1)\n" \
                 "fstpt (%" EAX ")\n" \
                 "fstpt " stringize(REAL10_SIZE) "(%" EAX ")\n" \
                 : : "m" (product)

/* Push the two 80-bit values stored at product and at
 * product + REAL10_SIZE, in that order. */
#define LOAD_RESULT "mov %0, %" EAX "\n" \
                    "fldt (%" EAX ")\n" \
                    "fldt " stringize(REAL10_SIZE) " (%" EAX ")\n" \
                    : : "m" (product) : EAX

/* Expand the template with the macros defined above. */
#include "matmul10.c"

#endif
#endif

