Actual source code: baijfact2.c
1: #define PETSCMAT_DLL
3: /*
4: Factorization code for BAIJ format.
5: */
7: #include src/mat/impls/baij/seq/baij.h
8: #include src/inline/ilu.h
9: #include src/inline/dot.h
13: PetscErrorCode MatSolveTranspose_SeqBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx)
14: {
15: Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data;
17: PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz;
18: PetscInt *diag = a->diag;
19: MatScalar *aa=a->a,*v;
20: PetscScalar s1,*x,*b;
23: VecCopy(bb,xx);
24: VecGetArray(bb,&b);
25: VecGetArray(xx,&x);
26:
27: /* forward solve the U^T */
28: for (i=0; i<n; i++) {
30: v = aa + diag[i];
31: /* multiply by the inverse of the block diagonal */
32: s1 = (*v++)*x[i];
33: vi = aj + diag[i] + 1;
34: nz = ai[i+1] - diag[i] - 1;
35: while (nz--) {
36: x[*vi++] -= (*v++)*s1;
37: }
38: x[i] = s1;
39: }
40: /* backward solve the L^T */
41: for (i=n-1; i>=0; i--){
42: v = aa + diag[i] - 1;
43: vi = aj + diag[i] - 1;
44: nz = diag[i] - ai[i];
45: s1 = x[i];
46: while (nz--) {
47: x[*vi--] -= (*v--)*s1;
48: }
49: }
50: VecRestoreArray(bb,&b);
51: VecRestoreArray(xx,&x);
52: PetscLogFlops(2*(a->nz) - A->cmap.n);
53: return(0);
54: }
58: PetscErrorCode MatSolveTranspose_SeqBAIJ_2_NaturalOrdering(Mat A,Vec bb,Vec xx)
59: {
60: Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data;
62: PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
63: PetscInt *diag = a->diag,oidx;
64: MatScalar *aa=a->a,*v;
65: PetscScalar s1,s2,x1,x2;
66: PetscScalar *x,*b;
69: VecCopy(bb,xx);
70: VecGetArray(bb,&b);
71: VecGetArray(xx,&x);
73: /* forward solve the U^T */
74: idx = 0;
75: for (i=0; i<n; i++) {
77: v = aa + 4*diag[i];
78: /* multiply by the inverse of the block diagonal */
79: x1 = x[idx]; x2 = x[1+idx];
80: s1 = v[0]*x1 + v[1]*x2;
81: s2 = v[2]*x1 + v[3]*x2;
82: v += 4;
84: vi = aj + diag[i] + 1;
85: nz = ai[i+1] - diag[i] - 1;
86: while (nz--) {
87: oidx = 2*(*vi++);
88: x[oidx] -= v[0]*s1 + v[1]*s2;
89: x[oidx+1] -= v[2]*s1 + v[3]*s2;
90: v += 4;
91: }
92: x[idx] = s1;x[1+idx] = s2;
93: idx += 2;
94: }
95: /* backward solve the L^T */
96: for (i=n-1; i>=0; i--){
97: v = aa + 4*diag[i] - 4;
98: vi = aj + diag[i] - 1;
99: nz = diag[i] - ai[i];
100: idt = 2*i;
101: s1 = x[idt]; s2 = x[1+idt];
102: while (nz--) {
103: idx = 2*(*vi--);
104: x[idx] -= v[0]*s1 + v[1]*s2;
105: x[idx+1] -= v[2]*s1 + v[3]*s2;
106: v -= 4;
107: }
108: }
109: VecRestoreArray(bb,&b);
110: VecRestoreArray(xx,&x);
111: PetscLogFlops(2*4*(a->nz) - 2*A->cmap.n);
112: return(0);
113: }
117: PetscErrorCode MatSolveTranspose_SeqBAIJ_3_NaturalOrdering(Mat A,Vec bb,Vec xx)
118: {
119: Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data;
121: PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
122: PetscInt *diag = a->diag,oidx;
123: MatScalar *aa=a->a,*v;
124: PetscScalar s1,s2,s3,x1,x2,x3;
125: PetscScalar *x,*b;
128: VecCopy(bb,xx);
129: VecGetArray(bb,&b);
130: VecGetArray(xx,&x);
132: /* forward solve the U^T */
133: idx = 0;
134: for (i=0; i<n; i++) {
136: v = aa + 9*diag[i];
137: /* multiply by the inverse of the block diagonal */
138: x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx];
139: s1 = v[0]*x1 + v[1]*x2 + v[2]*x3;
140: s2 = v[3]*x1 + v[4]*x2 + v[5]*x3;
141: s3 = v[6]*x1 + v[7]*x2 + v[8]*x3;
142: v += 9;
144: vi = aj + diag[i] + 1;
145: nz = ai[i+1] - diag[i] - 1;
146: while (nz--) {
147: oidx = 3*(*vi++);
148: x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3;
149: x[oidx+1] -= v[3]*s1 + v[4]*s2 + v[5]*s3;
150: x[oidx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
151: v += 9;
152: }
153: x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3;
154: idx += 3;
155: }
156: /* backward solve the L^T */
157: for (i=n-1; i>=0; i--){
158: v = aa + 9*diag[i] - 9;
159: vi = aj + diag[i] - 1;
160: nz = diag[i] - ai[i];
161: idt = 3*i;
162: s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt];
163: while (nz--) {
164: idx = 3*(*vi--);
165: x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3;
166: x[idx+1] -= v[3]*s1 + v[4]*s2 + v[5]*s3;
167: x[idx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
168: v -= 9;
169: }
170: }
171: VecRestoreArray(bb,&b);
172: VecRestoreArray(xx,&x);
173: PetscLogFlops(2*9*(a->nz) - 3*A->cmap.n);
174: return(0);
175: }
179: PetscErrorCode MatSolveTranspose_SeqBAIJ_4_NaturalOrdering(Mat A,Vec bb,Vec xx)
180: {
181: Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data;
183: PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
184: PetscInt *diag = a->diag,oidx;
185: MatScalar *aa=a->a,*v;
186: PetscScalar s1,s2,s3,s4,x1,x2,x3,x4;
187: PetscScalar *x,*b;
190: VecCopy(bb,xx);
191: VecGetArray(bb,&b);
192: VecGetArray(xx,&x);
194: /* forward solve the U^T */
195: idx = 0;
196: for (i=0; i<n; i++) {
198: v = aa + 16*diag[i];
199: /* multiply by the inverse of the block diagonal */
200: x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; x4 = x[3+idx];
201: s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4;
202: s2 = v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4;
203: s3 = v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4;
204: s4 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
205: v += 16;
207: vi = aj + diag[i] + 1;
208: nz = ai[i+1] - diag[i] - 1;
209: while (nz--) {
210: oidx = 4*(*vi++);
211: x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4;
212: x[oidx+1] -= v[4]*s1 + v[5]*s2 + v[6]*s3 + v[7]*s4;
213: x[oidx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
214: x[oidx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
215: v += 16;
216: }
217: x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4;
218: idx += 4;
219: }
220: /* backward solve the L^T */
221: for (i=n-1; i>=0; i--){
222: v = aa + 16*diag[i] - 16;
223: vi = aj + diag[i] - 1;
224: nz = diag[i] - ai[i];
225: idt = 4*i;
226: s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt];
227: while (nz--) {
228: idx = 4*(*vi--);
229: x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4;
230: x[idx+1] -= v[4]*s1 + v[5]*s2 + v[6]*s3 + v[7]*s4;
231: x[idx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
232: x[idx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
233: v -= 16;
234: }
235: }
236: VecRestoreArray(bb,&b);
237: VecRestoreArray(xx,&x);
238: PetscLogFlops(2*16*(a->nz) - 4*A->cmap.n);
239: return(0);
240: }
244: PetscErrorCode MatSolveTranspose_SeqBAIJ_5_NaturalOrdering(Mat A,Vec bb,Vec xx)
245: {
246: Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data;
248: PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
249: PetscInt *diag = a->diag,oidx;
250: MatScalar *aa=a->a,*v;
251: PetscScalar s1,s2,s3,s4,s5,x1,x2,x3,x4,x5;
252: PetscScalar *x,*b;
255: VecCopy(bb,xx);
256: VecGetArray(bb,&b);
257: VecGetArray(xx,&x);
259: /* forward solve the U^T */
260: idx = 0;
261: for (i=0; i<n; i++) {
263: v = aa + 25*diag[i];
264: /* multiply by the inverse of the block diagonal */
265: x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
266: s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5;
267: s2 = v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5;
268: s3 = v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5;
269: s4 = v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5;
270: s5 = v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5;
271: v += 25;
273: vi = aj + diag[i] + 1;
274: nz = ai[i+1] - diag[i] - 1;
275: while (nz--) {
276: oidx = 5*(*vi++);
277: x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5;
278: x[oidx+1] -= v[5]*s1 + v[6]*s2 + v[7]*s3 + v[8]*s4 + v[9]*s5;
279: x[oidx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
280: x[oidx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
281: x[oidx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
282: v += 25;
283: }
284: x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5;
285: idx += 5;
286: }
287: /* backward solve the L^T */
288: for (i=n-1; i>=0; i--){
289: v = aa + 25*diag[i] - 25;
290: vi = aj + diag[i] - 1;
291: nz = diag[i] - ai[i];
292: idt = 5*i;
293: s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
294: while (nz--) {
295: idx = 5*(*vi--);
296: x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5;
297: x[idx+1] -= v[5]*s1 + v[6]*s2 + v[7]*s3 + v[8]*s4 + v[9]*s5;
298: x[idx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
299: x[idx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
300: x[idx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
301: v -= 25;
302: }
303: }
304: VecRestoreArray(bb,&b);
305: VecRestoreArray(xx,&x);
306: PetscLogFlops(2*25*(a->nz) - 5*A->cmap.n);
307: return(0);
308: }
312: PetscErrorCode MatSolveTranspose_SeqBAIJ_6_NaturalOrdering(Mat A,Vec bb,Vec xx)
313: {
314: Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data;
316: PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
317: PetscInt *diag = a->diag,oidx;
318: MatScalar *aa=a->a,*v;
319: PetscScalar s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6;
320: PetscScalar *x,*b;
323: VecCopy(bb,xx);
324: VecGetArray(bb,&b);
325: VecGetArray(xx,&x);
327: /* forward solve the U^T */
328: idx = 0;
329: for (i=0; i<n; i++) {
331: v = aa + 36*diag[i];
332: /* multiply by the inverse of the block diagonal */
333: x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
334: x6 = x[5+idx];
335: s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5 + v[5]*x6;
336: s2 = v[6]*x1 + v[7]*x2 + v[8]*x3 + v[9]*x4 + v[10]*x5 + v[11]*x6;
337: s3 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4 + v[16]*x5 + v[17]*x6;
338: s4 = v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4 + v[22]*x5 + v[23]*x6;
339: s5 = v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4 + v[28]*x5 + v[29]*x6;
340: s6 = v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4 + v[34]*x5 + v[35]*x6;
341: v += 36;
343: vi = aj + diag[i] + 1;
344: nz = ai[i+1] - diag[i] - 1;
345: while (nz--) {
346: oidx = 6*(*vi++);
347: x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6;
348: x[oidx+1] -= v[6]*s1 + v[7]*s2 + v[8]*s3 + v[9]*s4 + v[10]*s5 + v[11]*s6;
349: x[oidx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
350: x[oidx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
351: x[oidx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
352: x[oidx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
353: v += 36;
354: }
355: x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5;
356: x[5+idx] = s6;
357: idx += 6;
358: }
359: /* backward solve the L^T */
360: for (i=n-1; i>=0; i--){
361: v = aa + 36*diag[i] - 36;
362: vi = aj + diag[i] - 1;
363: nz = diag[i] - ai[i];
364: idt = 6*i;
365: s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
366: s6 = x[5+idt];
367: while (nz--) {
368: idx = 6*(*vi--);
369: x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6;
370: x[idx+1] -= v[6]*s1 + v[7]*s2 + v[8]*s3 + v[9]*s4 + v[10]*s5 + v[11]*s6;
371: x[idx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
372: x[idx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
373: x[idx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
374: x[idx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
375: v -= 36;
376: }
377: }
378: VecRestoreArray(bb,&b);
379: VecRestoreArray(xx,&x);
380: PetscLogFlops(2*36*(a->nz) - 6*A->cmap.n);
381: return(0);
382: }
386: PetscErrorCode MatSolveTranspose_SeqBAIJ_7_NaturalOrdering(Mat A,Vec bb,Vec xx)
387: {
388: Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data;
390: PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
391: PetscInt *diag = a->diag,oidx;
392: MatScalar *aa=a->a,*v;
393: PetscScalar s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;
394: PetscScalar *x,*b;
397: VecCopy(bb,xx);
398: VecGetArray(bb,&b);
399: VecGetArray(xx,&x);
401: /* forward solve the U^T */
402: idx = 0;
403: for (i=0; i<n; i++) {
405: v = aa + 49*diag[i];
406: /* multiply by the inverse of the block diagonal */
407: x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
408: x6 = x[5+idx]; x7 = x[6+idx];
409: s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5 + v[5]*x6 + v[6]*x7;
410: s2 = v[7]*x1 + v[8]*x2 + v[9]*x3 + v[10]*x4 + v[11]*x5 + v[12]*x6 + v[13]*x7;
411: s3 = v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7;
412: s4 = v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4 + v[25]*x5 + v[26]*x6 + v[27]*x7;
413: s5 = v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7;
414: s6 = v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4 + v[39]*x5 + v[40]*x6 + v[41]*x7;
415: s7 = v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4 + v[46]*x5 + v[47]*x6 + v[48]*x7;
416: v += 49;
418: vi = aj + diag[i] + 1;
419: nz = ai[i+1] - diag[i] - 1;
420: while (nz--) {
421: oidx = 7*(*vi++);
422: x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6 + v[6]*s7;
423: x[oidx+1] -= v[7]*s1 + v[8]*s2 + v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
424: x[oidx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
425: x[oidx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
426: x[oidx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
427: x[oidx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
428: x[oidx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
429: v += 49;
430: }
431: x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5;
432: x[5+idx] = s6;x[6+idx] = s7;
433: idx += 7;
434: }
435: /* backward solve the L^T */
436: for (i=n-1; i>=0; i--){
437: v = aa + 49*diag[i] - 49;
438: vi = aj + diag[i] - 1;
439: nz = diag[i] - ai[i];
440: idt = 7*i;
441: s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
442: s6 = x[5+idt];s7 = x[6+idt];
443: while (nz--) {
444: idx = 7*(*vi--);
445: x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6 + v[6]*s7;
446: x[idx+1] -= v[7]*s1 + v[8]*s2 + v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
447: x[idx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
448: x[idx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
449: x[idx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
450: x[idx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
451: x[idx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
452: v -= 49;
453: }
454: }
455: VecRestoreArray(bb,&b);
456: VecRestoreArray(xx,&x);
457: PetscLogFlops(2*49*(a->nz) - 7*A->cmap.n);
458: return(0);
459: }
461: /*---------------------------------------------------------------------------------------------*/
464: PetscErrorCode MatSolveTranspose_SeqBAIJ_1(Mat A,Vec bb,Vec xx)
465: {
466: Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data;
467: IS iscol=a->col,isrow=a->row;
469: PetscInt *r,*c,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,*rout,*cout;
470: PetscInt *diag = a->diag;
471: MatScalar *aa=a->a,*v;
472: PetscScalar s1,*x,*b,*t;
475: VecGetArray(bb,&b);
476: VecGetArray(xx,&x);
477: t = a->solve_work;
479: ISGetIndices(isrow,&rout); r = rout;
480: ISGetIndices(iscol,&cout); c = cout;
482: /* copy the b into temp work space according to permutation */
483: for (i=0; i<n; i++) {
484: t[i] = b[c[i]];
485: }
487: /* forward solve the U^T */
488: for (i=0; i<n; i++) {
490: v = aa + diag[i];
491: /* multiply by the inverse of the block diagonal */
492: s1 = (*v++)*t[i];
493: vi = aj + diag[i] + 1;
494: nz = ai[i+1] - diag[i] - 1;
495: while (nz--) {
496: t[*vi++] -= (*v++)*s1;
497: }
498: t[i] = s1;
499: }
500: /* backward solve the L^T */
501: for (i=n-1; i>=0; i--){
502: v = aa + diag[i] - 1;
503: vi = aj + diag[i] - 1;
504: nz = diag[i] - ai[i];
505: s1 = t[i];
506: while (nz--) {
507: t[*vi--] -= (*v--)*s1;
508: }
509: }
511: /* copy t into x according to permutation */
512: for (i=0; i<n; i++) {
513: x[r[i]] = t[i];
514: }
516: ISRestoreIndices(isrow,&rout);
517: ISRestoreIndices(iscol,&cout);
518: VecRestoreArray(bb,&b);
519: VecRestoreArray(xx,&x);
520: PetscLogFlops(2*(a->nz) - A->cmap.n);
521: return(0);
522: }
526: PetscErrorCode MatSolveTranspose_SeqBAIJ_2(Mat A,Vec bb,Vec xx)
527: {
528: Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data;
529: IS iscol=a->col,isrow=a->row;
531: PetscInt *r,*c,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,*rout,*cout;
532: PetscInt *diag = a->diag,ii,ic,ir,oidx;
533: MatScalar *aa=a->a,*v;
534: PetscScalar s1,s2,x1,x2;
535: PetscScalar *x,*b,*t;
538: VecGetArray(bb,&b);
539: VecGetArray(xx,&x);
540: t = a->solve_work;
542: ISGetIndices(isrow,&rout); r = rout;
543: ISGetIndices(iscol,&cout); c = cout;
545: /* copy the b into temp work space according to permutation */
546: ii = 0;
547: for (i=0; i<n; i++) {
548: ic = 2*c[i];
549: t[ii] = b[ic];
550: t[ii+1] = b[ic+1];
551: ii += 2;
552: }
554: /* forward solve the U^T */
555: idx = 0;
556: for (i=0; i<n; i++) {
558: v = aa + 4*diag[i];
559: /* multiply by the inverse of the block diagonal */
560: x1 = t[idx]; x2 = t[1+idx];
561: s1 = v[0]*x1 + v[1]*x2;
562: s2 = v[2]*x1 + v[3]*x2;
563: v += 4;
565: vi = aj + diag[i] + 1;
566: nz = ai[i+1] - diag[i] - 1;
567: while (nz--) {
568: oidx = 2*(*vi++);
569: t[oidx] -= v[0]*s1 + v[1]*s2;
570: t[oidx+1] -= v[2]*s1 + v[3]*s2;
571: v += 4;
572: }
573: t[idx] = s1;t[1+idx] = s2;
574: idx += 2;
575: }
576: /* backward solve the L^T */
577: for (i=n-1; i>=0; i--){
578: v = aa + 4*diag[i] - 4;
579: vi = aj + diag[i] - 1;
580: nz = diag[i] - ai[i];
581: idt = 2*i;
582: s1 = t[idt]; s2 = t[1+idt];
583: while (nz--) {
584: idx = 2*(*vi--);
585: t[idx] -= v[0]*s1 + v[1]*s2;
586: t[idx+1] -= v[2]*s1 + v[3]*s2;
587: v -= 4;
588: }
589: }
591: /* copy t into x according to permutation */
592: ii = 0;
593: for (i=0; i<n; i++) {
594: ir = 2*r[i];
595: x[ir] = t[ii];
596: x[ir+1] = t[ii+1];
597: ii += 2;
598: }
600: ISRestoreIndices(isrow,&rout);
601: ISRestoreIndices(iscol,&cout);
602: VecRestoreArray(bb,&b);
603: VecRestoreArray(xx,&x);
604: PetscLogFlops(2*4*(a->nz) - 2*A->cmap.n);
605: return(0);
606: }
610: PetscErrorCode MatSolveTranspose_SeqBAIJ_3(Mat A,Vec bb,Vec xx)
611: {
612: Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data;
613: IS iscol=a->col,isrow=a->row;
615: PetscInt *r,*c,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,*rout,*cout;
616: PetscInt *diag = a->diag,ii,ic,ir,oidx;
617: MatScalar *aa=a->a,*v;
618: PetscScalar s1,s2,s3,x1,x2,x3;
619: PetscScalar *x,*b,*t;
622: VecGetArray(bb,&b);
623: VecGetArray(xx,&x);
624: t = a->solve_work;
626: ISGetIndices(isrow,&rout); r = rout;
627: ISGetIndices(iscol,&cout); c = cout;
629: /* copy the b into temp work space according to permutation */
630: ii = 0;
631: for (i=0; i<n; i++) {
632: ic = 3*c[i];
633: t[ii] = b[ic];
634: t[ii+1] = b[ic+1];
635: t[ii+2] = b[ic+2];
636: ii += 3;
637: }
639: /* forward solve the U^T */
640: idx = 0;
641: for (i=0; i<n; i++) {
643: v = aa + 9*diag[i];
644: /* multiply by the inverse of the block diagonal */
645: x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx];
646: s1 = v[0]*x1 + v[1]*x2 + v[2]*x3;
647: s2 = v[3]*x1 + v[4]*x2 + v[5]*x3;
648: s3 = v[6]*x1 + v[7]*x2 + v[8]*x3;
649: v += 9;
651: vi = aj + diag[i] + 1;
652: nz = ai[i+1] - diag[i] - 1;
653: while (nz--) {
654: oidx = 3*(*vi++);
655: t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3;
656: t[oidx+1] -= v[3]*s1 + v[4]*s2 + v[5]*s3;
657: t[oidx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
658: v += 9;
659: }
660: t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3;
661: idx += 3;
662: }
663: /* backward solve the L^T */
664: for (i=n-1; i>=0; i--){
665: v = aa + 9*diag[i] - 9;
666: vi = aj + diag[i] - 1;
667: nz = diag[i] - ai[i];
668: idt = 3*i;
669: s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];
670: while (nz--) {
671: idx = 3*(*vi--);
672: t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3;
673: t[idx+1] -= v[3]*s1 + v[4]*s2 + v[5]*s3;
674: t[idx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
675: v -= 9;
676: }
677: }
679: /* copy t into x according to permutation */
680: ii = 0;
681: for (i=0; i<n; i++) {
682: ir = 3*r[i];
683: x[ir] = t[ii];
684: x[ir+1] = t[ii+1];
685: x[ir+2] = t[ii+2];
686: ii += 3;
687: }
689: ISRestoreIndices(isrow,&rout);
690: ISRestoreIndices(iscol,&cout);
691: VecRestoreArray(bb,&b);
692: VecRestoreArray(xx,&x);
693: PetscLogFlops(2*9*(a->nz) - 3*A->cmap.n);
694: return(0);
695: }
699: PetscErrorCode MatSolveTranspose_SeqBAIJ_4(Mat A,Vec bb,Vec xx)
700: {
701: Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data;
702: IS iscol=a->col,isrow=a->row;
704: PetscInt *r,*c,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,*rout,*cout;
705: PetscInt *diag = a->diag,ii,ic,ir,oidx;
706: MatScalar *aa=a->a,*v;
707: PetscScalar s1,s2,s3,s4,x1,x2,x3,x4;
708: PetscScalar *x,*b,*t;
711: VecGetArray(bb,&b);
712: VecGetArray(xx,&x);
713: t = a->solve_work;
715: ISGetIndices(isrow,&rout); r = rout;
716: ISGetIndices(iscol,&cout); c = cout;
718: /* copy the b into temp work space according to permutation */
719: ii = 0;
720: for (i=0; i<n; i++) {
721: ic = 4*c[i];
722: t[ii] = b[ic];
723: t[ii+1] = b[ic+1];
724: t[ii+2] = b[ic+2];
725: t[ii+3] = b[ic+3];
726: ii += 4;
727: }
729: /* forward solve the U^T */
730: idx = 0;
731: for (i=0; i<n; i++) {
733: v = aa + 16*diag[i];
734: /* multiply by the inverse of the block diagonal */
735: x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; x4 = t[3+idx];
736: s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4;
737: s2 = v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4;
738: s3 = v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4;
739: s4 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
740: v += 16;
742: vi = aj + diag[i] + 1;
743: nz = ai[i+1] - diag[i] - 1;
744: while (nz--) {
745: oidx = 4*(*vi++);
746: t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4;
747: t[oidx+1] -= v[4]*s1 + v[5]*s2 + v[6]*s3 + v[7]*s4;
748: t[oidx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
749: t[oidx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
750: v += 16;
751: }
752: t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4;
753: idx += 4;
754: }
755: /* backward solve the L^T */
756: for (i=n-1; i>=0; i--){
757: v = aa + 16*diag[i] - 16;
758: vi = aj + diag[i] - 1;
759: nz = diag[i] - ai[i];
760: idt = 4*i;
761: s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt];
762: while (nz--) {
763: idx = 4*(*vi--);
764: t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4;
765: t[idx+1] -= v[4]*s1 + v[5]*s2 + v[6]*s3 + v[7]*s4;
766: t[idx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
767: t[idx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
768: v -= 16;
769: }
770: }
772: /* copy t into x according to permutation */
773: ii = 0;
774: for (i=0; i<n; i++) {
775: ir = 4*r[i];
776: x[ir] = t[ii];
777: x[ir+1] = t[ii+1];
778: x[ir+2] = t[ii+2];
779: x[ir+3] = t[ii+3];
780: ii += 4;
781: }
783: ISRestoreIndices(isrow,&rout);
784: ISRestoreIndices(iscol,&cout);
785: VecRestoreArray(bb,&b);
786: VecRestoreArray(xx,&x);
787: PetscLogFlops(2*16*(a->nz) - 4*A->cmap.n);
788: return(0);
789: }
793: PetscErrorCode MatSolveTranspose_SeqBAIJ_5(Mat A,Vec bb,Vec xx)
794: {
795: Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data;
796: IS iscol=a->col,isrow=a->row;
798: PetscInt *r,*c,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,*rout,*cout;
799: PetscInt *diag = a->diag,ii,ic,ir,oidx;
800: MatScalar *aa=a->a,*v;
801: PetscScalar s1,s2,s3,s4,s5,x1,x2,x3,x4,x5;
802: PetscScalar *x,*b,*t;
805: VecGetArray(bb,&b);
806: VecGetArray(xx,&x);
807: t = a->solve_work;
809: ISGetIndices(isrow,&rout); r = rout;
810: ISGetIndices(iscol,&cout); c = cout;
812: /* copy the b into temp work space according to permutation */
813: ii = 0;
814: for (i=0; i<n; i++) {
815: ic = 5*c[i];
816: t[ii] = b[ic];
817: t[ii+1] = b[ic+1];
818: t[ii+2] = b[ic+2];
819: t[ii+3] = b[ic+3];
820: t[ii+4] = b[ic+4];
821: ii += 5;
822: }
824: /* forward solve the U^T */
825: idx = 0;
826: for (i=0; i<n; i++) {
828: v = aa + 25*diag[i];
829: /* multiply by the inverse of the block diagonal */
830: x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
831: s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5;
832: s2 = v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5;
833: s3 = v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5;
834: s4 = v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5;
835: s5 = v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5;
836: v += 25;
838: vi = aj + diag[i] + 1;
839: nz = ai[i+1] - diag[i] - 1;
840: while (nz--) {
841: oidx = 5*(*vi++);
842: t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5;
843: t[oidx+1] -= v[5]*s1 + v[6]*s2 + v[7]*s3 + v[8]*s4 + v[9]*s5;
844: t[oidx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
845: t[oidx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
846: t[oidx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
847: v += 25;
848: }
849: t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
850: idx += 5;
851: }
852: /* backward solve the L^T */
853: for (i=n-1; i>=0; i--){
854: v = aa + 25*diag[i] - 25;
855: vi = aj + diag[i] - 1;
856: nz = diag[i] - ai[i];
857: idt = 5*i;
858: s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
859: while (nz--) {
860: idx = 5*(*vi--);
861: t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5;
862: t[idx+1] -= v[5]*s1 + v[6]*s2 + v[7]*s3 + v[8]*s4 + v[9]*s5;
863: t[idx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
864: t[idx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
865: t[idx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
866: v -= 25;
867: }
868: }
870: /* copy t into x according to permutation */
871: ii = 0;
872: for (i=0; i<n; i++) {
873: ir = 5*r[i];
874: x[ir] = t[ii];
875: x[ir+1] = t[ii+1];
876: x[ir+2] = t[ii+2];
877: x[ir+3] = t[ii+3];
878: x[ir+4] = t[ii+4];
879: ii += 5;
880: }
882: ISRestoreIndices(isrow,&rout);
883: ISRestoreIndices(iscol,&cout);
884: VecRestoreArray(bb,&b);
885: VecRestoreArray(xx,&x);
886: PetscLogFlops(2*25*(a->nz) - 5*A->cmap.n);
887: return(0);
888: }
892: PetscErrorCode MatSolveTranspose_SeqBAIJ_6(Mat A,Vec bb,Vec xx)
893: {
894: Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data;
895: IS iscol=a->col,isrow=a->row;
897: PetscInt *r,*c,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,*rout,*cout;
898: PetscInt *diag = a->diag,ii,ic,ir,oidx;
899: MatScalar *aa=a->a,*v;
900: PetscScalar s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6;
901: PetscScalar *x,*b,*t;
904: VecGetArray(bb,&b);
905: VecGetArray(xx,&x);
906: t = a->solve_work;
908: ISGetIndices(isrow,&rout); r = rout;
909: ISGetIndices(iscol,&cout); c = cout;
911: /* copy the b into temp work space according to permutation */
912: ii = 0;
913: for (i=0; i<n; i++) {
914: ic = 6*c[i];
915: t[ii] = b[ic];
916: t[ii+1] = b[ic+1];
917: t[ii+2] = b[ic+2];
918: t[ii+3] = b[ic+3];
919: t[ii+4] = b[ic+4];
920: t[ii+5] = b[ic+5];
921: ii += 6;
922: }
924: /* forward solve the U^T */
925: idx = 0;
926: for (i=0; i<n; i++) {
928: v = aa + 36*diag[i];
929: /* multiply by the inverse of the block diagonal */
930: x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
931: x6 = t[5+idx];
932: s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5 + v[5]*x6;
933: s2 = v[6]*x1 + v[7]*x2 + v[8]*x3 + v[9]*x4 + v[10]*x5 + v[11]*x6;
934: s3 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4 + v[16]*x5 + v[17]*x6;
935: s4 = v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4 + v[22]*x5 + v[23]*x6;
936: s5 = v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4 + v[28]*x5 + v[29]*x6;
937: s6 = v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4 + v[34]*x5 + v[35]*x6;
938: v += 36;
940: vi = aj + diag[i] + 1;
941: nz = ai[i+1] - diag[i] - 1;
942: while (nz--) {
943: oidx = 6*(*vi++);
944: t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6;
945: t[oidx+1] -= v[6]*s1 + v[7]*s2 + v[8]*s3 + v[9]*s4 + v[10]*s5 + v[11]*s6;
946: t[oidx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
947: t[oidx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
948: t[oidx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
949: t[oidx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
950: v += 36;
951: }
952: t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
953: t[5+idx] = s6;
954: idx += 6;
955: }
956: /* backward solve the L^T */
957: for (i=n-1; i>=0; i--){
958: v = aa + 36*diag[i] - 36;
959: vi = aj + diag[i] - 1;
960: nz = diag[i] - ai[i];
961: idt = 6*i;
962: s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
963: s6 = t[5+idt];
964: while (nz--) {
965: idx = 6*(*vi--);
966: t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6;
967: t[idx+1] -= v[6]*s1 + v[7]*s2 + v[8]*s3 + v[9]*s4 + v[10]*s5 + v[11]*s6;
968: t[idx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
969: t[idx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
970: t[idx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
971: t[idx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
972: v -= 36;
973: }
974: }
976: /* copy t into x according to permutation */
977: ii = 0;
978: for (i=0; i<n; i++) {
979: ir = 6*r[i];
980: x[ir] = t[ii];
981: x[ir+1] = t[ii+1];
982: x[ir+2] = t[ii+2];
983: x[ir+3] = t[ii+3];
984: x[ir+4] = t[ii+4];
985: x[ir+5] = t[ii+5];
986: ii += 6;
987: }
989: ISRestoreIndices(isrow,&rout);
990: ISRestoreIndices(iscol,&cout);
991: VecRestoreArray(bb,&b);
992: VecRestoreArray(xx,&x);
993: PetscLogFlops(2*36*(a->nz) - 6*A->cmap.n);
994: return(0);
995: }
999: PetscErrorCode MatSolveTranspose_SeqBAIJ_7(Mat A,Vec bb,Vec xx)
1000: {
1001: Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data;
1002: IS iscol=a->col,isrow=a->row;
1004: PetscInt *r,*c,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,*rout,*cout;
1005: PetscInt *diag = a->diag,ii,ic,ir,oidx;
1006: MatScalar *aa=a->a,*v;
1007: PetscScalar s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;
1008: PetscScalar *x,*b,*t;
1011: VecGetArray(bb,&b);
1012: VecGetArray(xx,&x);
1013: t = a->solve_work;
1015: ISGetIndices(isrow,&rout); r = rout;
1016: ISGetIndices(iscol,&cout); c = cout;
1018: /* copy the b into temp work space according to permutation */
1019: ii = 0;
1020: for (i=0; i<n; i++) {
1021: ic = 7*c[i];
1022: t[ii] = b[ic];
1023: t[ii+1] = b[ic+1];
1024: t[ii+2] = b[ic+2];
1025: t[ii+3] = b[ic+3];
1026: t[ii+4] = b[ic+4];
1027: t[ii+5] = b[ic+5];
1028: t[ii+6] = b[ic+6];
1029: ii += 7;
1030: }
1032: /* forward solve the U^T */
1033: idx = 0;
1034: for (i=0; i<n; i++) {
1036: v = aa + 49*diag[i];
1037: /* multiply by the inverse of the block diagonal */
1038: x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
1039: x6 = t[5+idx]; x7 = t[6+idx];
1040: s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5 + v[5]*x6 + v[6]*x7;
1041: s2 = v[7]*x1 + v[8]*x2 + v[9]*x3 + v[10]*x4 + v[11]*x5 + v[12]*x6 + v[13]*x7;
1042: s3 = v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7;
1043: s4 = v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4 + v[25]*x5 + v[26]*x6 + v[27]*x7;
1044: s5 = v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7;
1045: s6 = v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4 + v[39]*x5 + v[40]*x6 + v[41]*x7;
1046: s7 = v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4 + v[46]*x5 + v[47]*x6 + v[48]*x7;
1047: v += 49;
1049: vi = aj + diag[i] + 1;
1050: nz = ai[i+1] - diag[i] - 1;
1051: while (nz--) {
1052: oidx = 7*(*vi++);
1053: t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6 + v[6]*s7;
1054: t[oidx+1] -= v[7]*s1 + v[8]*s2 + v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
1055: t[oidx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
1056: t[oidx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
1057: t[oidx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
1058: t[oidx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
1059: t[oidx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
1060: v += 49;
1061: }
1062: t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
1063: t[5+idx] = s6;t[6+idx] = s7;
1064: idx += 7;
1065: }
1066: /* backward solve the L^T */
1067: for (i=n-1; i>=0; i--){
1068: v = aa + 49*diag[i] - 49;
1069: vi = aj + diag[i] - 1;
1070: nz = diag[i] - ai[i];
1071: idt = 7*i;
1072: s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
1073: s6 = t[5+idt];s7 = t[6+idt];
1074: while (nz--) {
1075: idx = 7*(*vi--);
1076: t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6 + v[6]*s7;
1077: t[idx+1] -= v[7]*s1 + v[8]*s2 + v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
1078: t[idx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
1079: t[idx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
1080: t[idx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
1081: t[idx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
1082: t[idx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
1083: v -= 49;
1084: }
1085: }
1087: /* copy t into x according to permutation */
1088: ii = 0;
1089: for (i=0; i<n; i++) {
1090: ir = 7*r[i];
1091: x[ir] = t[ii];
1092: x[ir+1] = t[ii+1];
1093: x[ir+2] = t[ii+2];
1094: x[ir+3] = t[ii+3];
1095: x[ir+4] = t[ii+4];
1096: x[ir+5] = t[ii+5];
1097: x[ir+6] = t[ii+6];
1098: ii += 7;
1099: }
1101: ISRestoreIndices(isrow,&rout);
1102: ISRestoreIndices(iscol,&cout);
1103: VecRestoreArray(bb,&b);
1104: VecRestoreArray(xx,&x);
1105: PetscLogFlops(2*49*(a->nz) - 7*A->cmap.n);
1106: return(0);
1107: }
1109: /* ----------------------------------------------------------- */
1112: PetscErrorCode MatSolve_SeqBAIJ_N(Mat A,Vec bb,Vec xx)
1113: {
1114: Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data;
1115: IS iscol=a->col,isrow=a->row;
1117: PetscInt *r,*c,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
1118: PetscInt nz,bs=A->rmap.bs,bs2=a->bs2,*rout,*cout;
1119: MatScalar *aa=a->a,*v;
1120: PetscScalar *x,*b,*s,*t,*ls;
1123: VecGetArray(bb,&b);
1124: VecGetArray(xx,&x);
1125: t = a->solve_work;
1127: ISGetIndices(isrow,&rout); r = rout;
1128: ISGetIndices(iscol,&cout); c = cout + (n-1);
1130: /* forward solve the lower triangular */
1131: PetscMemcpy(t,b+bs*(*r++),bs*sizeof(PetscScalar));
1132: for (i=1; i<n; i++) {
1133: v = aa + bs2*ai[i];
1134: vi = aj + ai[i];
1135: nz = a->diag[i] - ai[i];
1136: s = t + bs*i;
1137: PetscMemcpy(s,b+bs*(*r++),bs*sizeof(PetscScalar));
1138: while (nz--) {
1139: Kernel_v_gets_v_minus_A_times_w(bs,s,v,t+bs*(*vi++));
1140: v += bs2;
1141: }
1142: }
1143: /* backward solve the upper triangular */
1144: ls = a->solve_work + A->cmap.n;
1145: for (i=n-1; i>=0; i--){
1146: v = aa + bs2*(a->diag[i] + 1);
1147: vi = aj + a->diag[i] + 1;
1148: nz = ai[i+1] - a->diag[i] - 1;
1149: PetscMemcpy(ls,t+i*bs,bs*sizeof(PetscScalar));
1150: while (nz--) {
1151: Kernel_v_gets_v_minus_A_times_w(bs,ls,v,t+bs*(*vi++));
1152: v += bs2;
1153: }
1154: Kernel_w_gets_A_times_v(bs,ls,aa+bs2*a->diag[i],t+i*bs);
1155: PetscMemcpy(x + bs*(*c--),t+i*bs,bs*sizeof(PetscScalar));
1156: }
1158: ISRestoreIndices(isrow,&rout);
1159: ISRestoreIndices(iscol,&cout);
1160: VecRestoreArray(bb,&b);
1161: VecRestoreArray(xx,&x);
1162: PetscLogFlops(2*(a->bs2)*(a->nz) - A->rmap.bs*A->cmap.n);
1163: return(0);
1164: }
1168: PetscErrorCode MatSolve_SeqBAIJ_7(Mat A,Vec bb,Vec xx)
1169: {
1170: Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data;
1171: IS iscol=a->col,isrow=a->row;
1173: PetscInt *r,*c,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout;
1174: PetscInt *diag = a->diag;
1175: MatScalar *aa=a->a,*v;
1176: PetscScalar s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;
1177: PetscScalar *x,*b,*t;
1180: VecGetArray(bb,&b);
1181: VecGetArray(xx,&x);
1182: t = a->solve_work;
1184: ISGetIndices(isrow,&rout); r = rout;
1185: ISGetIndices(iscol,&cout); c = cout + (n-1);
1187: /* forward solve the lower triangular */
1188: idx = 7*(*r++);
1189: t[0] = b[idx]; t[1] = b[1+idx];
1190: t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx];
1191: t[5] = b[5+idx]; t[6] = b[6+idx];
1193: for (i=1; i<n; i++) {
1194: v = aa + 49*ai[i];
1195: vi = aj + ai[i];
1196: nz = diag[i] - ai[i];
1197: idx = 7*(*r++);
1198: s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
1199: s5 = b[4+idx];s6 = b[5+idx];s7 = b[6+idx];
1200: while (nz--) {
1201: idx = 7*(*vi++);
1202: x1 = t[idx]; x2 = t[1+idx];x3 = t[2+idx];
1203: x4 = t[3+idx];x5 = t[4+idx];
1204: x6 = t[5+idx];x7 = t[6+idx];
1205: s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
1206: s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
1207: s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
1208: s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
1209: s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
1210: s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
1211: s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
1212: v += 49;
1213: }
1214: idx = 7*i;
1215: t[idx] = s1;t[1+idx] = s2;
1216: t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
1217: t[5+idx] = s6;t[6+idx] = s7;
1218: }
1219: /* backward solve the upper triangular */
1220: for (i=n-1; i>=0; i--){
1221: v = aa + 49*diag[i] + 49;
1222: vi = aj + diag[i] + 1;
1223: nz = ai[i+1] - diag[i] - 1;
1224: idt = 7*i;
1225: s1 = t[idt]; s2 = t[1+idt];
1226: s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
1227: s6 = t[5+idt];s7 = t[6+idt];
1228: while (nz--) {
1229: idx = 7*(*vi++);
1230: x1 = t[idx]; x2 = t[1+idx];
1231: x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
1232: x6 = t[5+idx]; x7 = t[6+idx];
1233: s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
1234: s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
1235: s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
1236: s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
1237: s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
1238: s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
1239: s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
1240: v += 49;
1241: }
1242: idc = 7*(*c--);
1243: v = aa + 49*diag[i];
1244: x[idc] = t[idt] = v[0]*s1+v[7]*s2+v[14]*s3+
1245: v[21]*s4+v[28]*s5+v[35]*s6+v[42]*s7;
1246: x[1+idc] = t[1+idt] = v[1]*s1+v[8]*s2+v[15]*s3+
1247: v[22]*s4+v[29]*s5+v[36]*s6+v[43]*s7;
1248: x[2+idc] = t[2+idt] = v[2]*s1+v[9]*s2+v[16]*s3+
1249: v[23]*s4+v[30]*s5+v[37]*s6+v[44]*s7;
1250: x[3+idc] = t[3+idt] = v[3]*s1+v[10]*s2+v[17]*s3+
1251: v[24]*s4+v[31]*s5+v[38]*s6+v[45]*s7;
1252: x[4+idc] = t[4+idt] = v[4]*s1+v[11]*s2+v[18]*s3+
1253: v[25]*s4+v[32]*s5+v[39]*s6+v[46]*s7;
1254: x[5+idc] = t[5+idt] = v[5]*s1+v[12]*s2+v[19]*s3+
1255: v[26]*s4+v[33]*s5+v[40]*s6+v[47]*s7;
1256: x[6+idc] = t[6+idt] = v[6]*s1+v[13]*s2+v[20]*s3+
1257: v[27]*s4+v[34]*s5+v[41]*s6+v[48]*s7;
1258: }
1260: ISRestoreIndices(isrow,&rout);
1261: ISRestoreIndices(iscol,&cout);
1262: VecRestoreArray(bb,&b);
1263: VecRestoreArray(xx,&x);
1264: PetscLogFlops(2*49*(a->nz) - 7*A->cmap.n);
1265: return(0);
1266: }
1270: PetscErrorCode MatSolve_SeqBAIJ_7_NaturalOrdering(Mat A,Vec bb,Vec xx)
1271: {
1272: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
1273: PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
1275: PetscInt *diag = a->diag,jdx;
1276: MatScalar *aa=a->a,*v;
1277: PetscScalar *x,*b,s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;
1280: VecGetArray(bb,&b);
1281: VecGetArray(xx,&x);
1282: /* forward solve the lower triangular */
1283: idx = 0;
1284: x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx];
1285: x[3] = b[3+idx]; x[4] = b[4+idx]; x[5] = b[5+idx];
1286: x[6] = b[6+idx];
1287: for (i=1; i<n; i++) {
1288: v = aa + 49*ai[i];
1289: vi = aj + ai[i];
1290: nz = diag[i] - ai[i];
1291: idx = 7*i;
1292: s1 = b[idx]; s2 = b[1+idx]; s3 = b[2+idx];
1293: s4 = b[3+idx]; s5 = b[4+idx]; s6 = b[5+idx];
1294: s7 = b[6+idx];
1295: while (nz--) {
1296: jdx = 7*(*vi++);
1297: x1 = x[jdx]; x2 = x[1+jdx]; x3 = x[2+jdx];
1298: x4 = x[3+jdx]; x5 = x[4+jdx]; x6 = x[5+jdx];
1299: x7 = x[6+jdx];
1300: s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
1301: s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
1302: s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
1303: s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
1304: s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
1305: s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
1306: s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
1307: v += 49;
1308: }
1309: x[idx] = s1;
1310: x[1+idx] = s2;
1311: x[2+idx] = s3;
1312: x[3+idx] = s4;
1313: x[4+idx] = s5;
1314: x[5+idx] = s6;
1315: x[6+idx] = s7;
1316: }
1317: /* backward solve the upper triangular */
1318: for (i=n-1; i>=0; i--){
1319: v = aa + 49*diag[i] + 49;
1320: vi = aj + diag[i] + 1;
1321: nz = ai[i+1] - diag[i] - 1;
1322: idt = 7*i;
1323: s1 = x[idt]; s2 = x[1+idt];
1324: s3 = x[2+idt]; s4 = x[3+idt];
1325: s5 = x[4+idt]; s6 = x[5+idt];
1326: s7 = x[6+idt];
1327: while (nz--) {
1328: idx = 7*(*vi++);
1329: x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx];
1330: x4 = x[3+idx]; x5 = x[4+idx]; x6 = x[5+idx];
1331: x7 = x[6+idx];
1332: s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
1333: s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
1334: s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
1335: s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
1336: s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
1337: s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
1338: s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
1339: v += 49;
1340: }
1341: v = aa + 49*diag[i];
1342: x[idt] = v[0]*s1 + v[7]*s2 + v[14]*s3 + v[21]*s4
1343: + v[28]*s5 + v[35]*s6 + v[42]*s7;
1344: x[1+idt] = v[1]*s1 + v[8]*s2 + v[15]*s3 + v[22]*s4
1345: + v[29]*s5 + v[36]*s6 + v[43]*s7;
1346: x[2+idt] = v[2]*s1 + v[9]*s2 + v[16]*s3 + v[23]*s4
1347: + v[30]*s5 + v[37]*s6 + v[44]*s7;
1348: x[3+idt] = v[3]*s1 + v[10]*s2 + v[17]*s3 + v[24]*s4
1349: + v[31]*s5 + v[38]*s6 + v[45]*s7;
1350: x[4+idt] = v[4]*s1 + v[11]*s2 + v[18]*s3 + v[25]*s4
1351: + v[32]*s5 + v[39]*s6 + v[46]*s7;
1352: x[5+idt] = v[5]*s1 + v[12]*s2 + v[19]*s3 + v[26]*s4
1353: + v[33]*s5 + v[40]*s6 + v[47]*s7;
1354: x[6+idt] = v[6]*s1 + v[13]*s2 + v[20]*s3 + v[27]*s4
1355: + v[34]*s5 + v[41]*s6 + v[48]*s7;
1356: }
1358: VecRestoreArray(bb,&b);
1359: VecRestoreArray(xx,&x);
1360: PetscLogFlops(2*36*(a->nz) - 6*A->cmap.n);
1361: return(0);
1362: }
1366: PetscErrorCode MatSolve_SeqBAIJ_6(Mat A,Vec bb,Vec xx)
1367: {
1368: Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data;
1369: IS iscol=a->col,isrow=a->row;
1371: PetscInt *r,*c,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout;
1372: PetscInt *diag = a->diag;
1373: MatScalar *aa=a->a,*v;
1374: PetscScalar *x,*b,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6,*t;
1377: VecGetArray(bb,&b);
1378: VecGetArray(xx,&x);
1379: t = a->solve_work;
1381: ISGetIndices(isrow,&rout); r = rout;
1382: ISGetIndices(iscol,&cout); c = cout + (n-1);
1384: /* forward solve the lower triangular */
1385: idx = 6*(*r++);
1386: t[0] = b[idx]; t[1] = b[1+idx];
1387: t[2] = b[2+idx]; t[3] = b[3+idx];
1388: t[4] = b[4+idx]; t[5] = b[5+idx];
1389: for (i=1; i<n; i++) {
1390: v = aa + 36*ai[i];
1391: vi = aj + ai[i];
1392: nz = diag[i] - ai[i];
1393: idx = 6*(*r++);
1394: s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
1395: s5 = b[4+idx]; s6 = b[5+idx];
1396: while (nz--) {
1397: idx = 6*(*vi++);
1398: x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx];
1399: x4 = t[3+idx]; x5 = t[4+idx]; x6 = t[5+idx];
1400: s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
1401: s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
1402: s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
1403: s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
1404: s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
1405: s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
1406: v += 36;
1407: }
1408: idx = 6*i;
1409: t[idx] = s1;t[1+idx] = s2;
1410: t[2+idx] = s3;t[3+idx] = s4;
1411: t[4+idx] = s5;t[5+idx] = s6;
1412: }
1413: /* backward solve the upper triangular */
1414: for (i=n-1; i>=0; i--){
1415: v = aa + 36*diag[i] + 36;
1416: vi = aj + diag[i] + 1;
1417: nz = ai[i+1] - diag[i] - 1;
1418: idt = 6*i;
1419: s1 = t[idt]; s2 = t[1+idt];
1420: s3 = t[2+idt];s4 = t[3+idt];
1421: s5 = t[4+idt];s6 = t[5+idt];
1422: while (nz--) {
1423: idx = 6*(*vi++);
1424: x1 = t[idx]; x2 = t[1+idx];
1425: x3 = t[2+idx]; x4 = t[3+idx];
1426: x5 = t[4+idx]; x6 = t[5+idx];
1427: s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
1428: s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
1429: s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
1430: s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
1431: s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
1432: s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
1433: v += 36;
1434: }
1435: idc = 6*(*c--);
1436: v = aa + 36*diag[i];
1437: x[idc] = t[idt] = v[0]*s1+v[6]*s2+v[12]*s3+
1438: v[18]*s4+v[24]*s5+v[30]*s6;
1439: x[1+idc] = t[1+idt] = v[1]*s1+v[7]*s2+v[13]*s3+
1440: v[19]*s4+v[25]*s5+v[31]*s6;
1441: x[2+idc] = t[2+idt] = v[2]*s1+v[8]*s2+v[14]*s3+
1442: v[20]*s4+v[26]*s5+v[32]*s6;
1443: x[3+idc] = t[3+idt] = v[3]*s1+v[9]*s2+v[15]*s3+
1444: v[21]*s4+v[27]*s5+v[33]*s6;
1445: x[4+idc] = t[4+idt] = v[4]*s1+v[10]*s2+v[16]*s3+
1446: v[22]*s4+v[28]*s5+v[34]*s6;
1447: x[5+idc] = t[5+idt] = v[5]*s1+v[11]*s2+v[17]*s3+
1448: v[23]*s4+v[29]*s5+v[35]*s6;
1449: }
1451: ISRestoreIndices(isrow,&rout);
1452: ISRestoreIndices(iscol,&cout);
1453: VecRestoreArray(bb,&b);
1454: VecRestoreArray(xx,&x);
1455: PetscLogFlops(2*36*(a->nz) - 6*A->cmap.n);
1456: return(0);
1457: }
1461: PetscErrorCode MatSolve_SeqBAIJ_6_NaturalOrdering(Mat A,Vec bb,Vec xx)
1462: {
1463: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
1464: PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
1466: PetscInt *diag = a->diag,jdx;
1467: MatScalar *aa=a->a,*v;
1468: PetscScalar *x,*b,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6;
1471: VecGetArray(bb,&b);
1472: VecGetArray(xx,&x);
1473: /* forward solve the lower triangular */
1474: idx = 0;
1475: x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx];
1476: x[3] = b[3+idx]; x[4] = b[4+idx]; x[5] = b[5+idx];
1477: for (i=1; i<n; i++) {
1478: v = aa + 36*ai[i];
1479: vi = aj + ai[i];
1480: nz = diag[i] - ai[i];
1481: idx = 6*i;
1482: s1 = b[idx]; s2 = b[1+idx]; s3 = b[2+idx];
1483: s4 = b[3+idx]; s5 = b[4+idx]; s6 = b[5+idx];
1484: while (nz--) {
1485: jdx = 6*(*vi++);
1486: x1 = x[jdx]; x2 = x[1+jdx]; x3 = x[2+jdx];
1487: x4 = x[3+jdx]; x5 = x[4+jdx]; x6 = x[5+jdx];
1488: s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
1489: s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
1490: s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
1491: s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
1492: s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
1493: s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
1494: v += 36;
1495: }
1496: x[idx] = s1;
1497: x[1+idx] = s2;
1498: x[2+idx] = s3;
1499: x[3+idx] = s4;
1500: x[4+idx] = s5;
1501: x[5+idx] = s6;
1502: }
1503: /* backward solve the upper triangular */
1504: for (i=n-1; i>=0; i--){
1505: v = aa + 36*diag[i] + 36;
1506: vi = aj + diag[i] + 1;
1507: nz = ai[i+1] - diag[i] - 1;
1508: idt = 6*i;
1509: s1 = x[idt]; s2 = x[1+idt];
1510: s3 = x[2+idt]; s4 = x[3+idt];
1511: s5 = x[4+idt]; s6 = x[5+idt];
1512: while (nz--) {
1513: idx = 6*(*vi++);
1514: x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx];
1515: x4 = x[3+idx]; x5 = x[4+idx]; x6 = x[5+idx];
1516: s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
1517: s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
1518: s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
1519: s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
1520: s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
1521: s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
1522: v += 36;
1523: }
1524: v = aa + 36*diag[i];
1525: x[idt] = v[0]*s1 + v[6]*s2 + v[12]*s3 + v[18]*s4 + v[24]*s5 + v[30]*s6;
1526: x[1+idt] = v[1]*s1 + v[7]*s2 + v[13]*s3 + v[19]*s4 + v[25]*s5 + v[31]*s6;
1527: x[2+idt] = v[2]*s1 + v[8]*s2 + v[14]*s3 + v[20]*s4 + v[26]*s5 + v[32]*s6;
1528: x[3+idt] = v[3]*s1 + v[9]*s2 + v[15]*s3 + v[21]*s4 + v[27]*s5 + v[33]*s6;
1529: x[4+idt] = v[4]*s1 + v[10]*s2 + v[16]*s3 + v[22]*s4 + v[28]*s5 + v[34]*s6;
1530: x[5+idt] = v[5]*s1 + v[11]*s2 + v[17]*s3 + v[23]*s4 + v[29]*s5 + v[35]*s6;
1531: }
1533: VecRestoreArray(bb,&b);
1534: VecRestoreArray(xx,&x);
1535: PetscLogFlops(2*36*(a->nz) - 6*A->cmap.n);
1536: return(0);
1537: }
1541: PetscErrorCode MatSolve_SeqBAIJ_5(Mat A,Vec bb,Vec xx)
1542: {
1543: Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data;
1544: IS iscol=a->col,isrow=a->row;
1546: PetscInt *r,*c,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout;
1547: PetscInt *diag = a->diag;
1548: MatScalar *aa=a->a,*v;
1549: PetscScalar *x,*b,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5,*t;
1552: VecGetArray(bb,&b);
1553: VecGetArray(xx,&x);
1554: t = a->solve_work;
1556: ISGetIndices(isrow,&rout); r = rout;
1557: ISGetIndices(iscol,&cout); c = cout + (n-1);
1559: /* forward solve the lower triangular */
1560: idx = 5*(*r++);
1561: t[0] = b[idx]; t[1] = b[1+idx];
1562: t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx];
1563: for (i=1; i<n; i++) {
1564: v = aa + 25*ai[i];
1565: vi = aj + ai[i];
1566: nz = diag[i] - ai[i];
1567: idx = 5*(*r++);
1568: s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
1569: s5 = b[4+idx];
1570: while (nz--) {
1571: idx = 5*(*vi++);
1572: x1 = t[idx]; x2 = t[1+idx];x3 = t[2+idx];
1573: x4 = t[3+idx];x5 = t[4+idx];
1574: s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
1575: s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
1576: s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
1577: s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
1578: s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
1579: v += 25;
1580: }
1581: idx = 5*i;
1582: t[idx] = s1;t[1+idx] = s2;
1583: t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
1584: }
1585: /* backward solve the upper triangular */
1586: for (i=n-1; i>=0; i--){
1587: v = aa + 25*diag[i] + 25;
1588: vi = aj + diag[i] + 1;
1589: nz = ai[i+1] - diag[i] - 1;
1590: idt = 5*i;
1591: s1 = t[idt]; s2 = t[1+idt];
1592: s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
1593: while (nz--) {
1594: idx = 5*(*vi++);
1595: x1 = t[idx]; x2 = t[1+idx];
1596: x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
1597: s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
1598: s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
1599: s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
1600: s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
1601: s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
1602: v += 25;
1603: }
1604: idc = 5*(*c--);
1605: v = aa + 25*diag[i];
1606: x[idc] = t[idt] = v[0]*s1+v[5]*s2+v[10]*s3+
1607: v[15]*s4+v[20]*s5;
1608: x[1+idc] = t[1+idt] = v[1]*s1+v[6]*s2+v[11]*s3+
1609: v[16]*s4+v[21]*s5;
1610: x[2+idc] = t[2+idt] = v[2]*s1+v[7]*s2+v[12]*s3+
1611: v[17]*s4+v[22]*s5;
1612: x[3+idc] = t[3+idt] = v[3]*s1+v[8]*s2+v[13]*s3+
1613: v[18]*s4+v[23]*s5;
1614: x[4+idc] = t[4+idt] = v[4]*s1+v[9]*s2+v[14]*s3+
1615: v[19]*s4+v[24]*s5;
1616: }
1618: ISRestoreIndices(isrow,&rout);
1619: ISRestoreIndices(iscol,&cout);
1620: VecRestoreArray(bb,&b);
1621: VecRestoreArray(xx,&x);
1622: PetscLogFlops(2*25*(a->nz) - 5*A->cmap.n);
1623: return(0);
1624: }
1628: PetscErrorCode MatSolve_SeqBAIJ_5_NaturalOrdering(Mat A,Vec bb,Vec xx)
1629: {
1630: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
1631: PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
1633: PetscInt *diag = a->diag,jdx;
1634: MatScalar *aa=a->a,*v;
1635: PetscScalar *x,*b,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5;
1638: VecGetArray(bb,&b);
1639: VecGetArray(xx,&x);
1640: /* forward solve the lower triangular */
1641: idx = 0;
1642: x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx]; x[3] = b[3+idx];x[4] = b[4+idx];
1643: for (i=1; i<n; i++) {
1644: v = aa + 25*ai[i];
1645: vi = aj + ai[i];
1646: nz = diag[i] - ai[i];
1647: idx = 5*i;
1648: s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];s5 = b[4+idx];
1649: while (nz--) {
1650: jdx = 5*(*vi++);
1651: x1 = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx];x5 = x[4+jdx];
1652: s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
1653: s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
1654: s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
1655: s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
1656: s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
1657: v += 25;
1658: }
1659: x[idx] = s1;
1660: x[1+idx] = s2;
1661: x[2+idx] = s3;
1662: x[3+idx] = s4;
1663: x[4+idx] = s5;
1664: }
1665: /* backward solve the upper triangular */
1666: for (i=n-1; i>=0; i--){
1667: v = aa + 25*diag[i] + 25;
1668: vi = aj + diag[i] + 1;
1669: nz = ai[i+1] - diag[i] - 1;
1670: idt = 5*i;
1671: s1 = x[idt]; s2 = x[1+idt];
1672: s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
1673: while (nz--) {
1674: idx = 5*(*vi++);
1675: x1 = x[idx]; x2 = x[1+idx];x3 = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
1676: s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
1677: s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
1678: s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
1679: s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
1680: s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
1681: v += 25;
1682: }
1683: v = aa + 25*diag[i];
1684: x[idt] = v[0]*s1 + v[5]*s2 + v[10]*s3 + v[15]*s4 + v[20]*s5;
1685: x[1+idt] = v[1]*s1 + v[6]*s2 + v[11]*s3 + v[16]*s4 + v[21]*s5;
1686: x[2+idt] = v[2]*s1 + v[7]*s2 + v[12]*s3 + v[17]*s4 + v[22]*s5;
1687: x[3+idt] = v[3]*s1 + v[8]*s2 + v[13]*s3 + v[18]*s4 + v[23]*s5;
1688: x[4+idt] = v[4]*s1 + v[9]*s2 + v[14]*s3 + v[19]*s4 + v[24]*s5;
1689: }
1691: VecRestoreArray(bb,&b);
1692: VecRestoreArray(xx,&x);
1693: PetscLogFlops(2*25*(a->nz) - 5*A->cmap.n);
1694: return(0);
1695: }
1699: PetscErrorCode MatSolve_SeqBAIJ_4(Mat A,Vec bb,Vec xx)
1700: {
1701: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
1702: IS iscol=a->col,isrow=a->row;
1704: PetscInt *r,*c,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout;
1705: PetscInt *diag = a->diag;
1706: MatScalar *aa=a->a,*v;
1707: PetscScalar *x,*b,s1,s2,s3,s4,x1,x2,x3,x4,*t;
1710: VecGetArray(bb,&b);
1711: VecGetArray(xx,&x);
1712: t = a->solve_work;
1714: ISGetIndices(isrow,&rout); r = rout;
1715: ISGetIndices(iscol,&cout); c = cout + (n-1);
1717: /* forward solve the lower triangular */
1718: idx = 4*(*r++);
1719: t[0] = b[idx]; t[1] = b[1+idx];
1720: t[2] = b[2+idx]; t[3] = b[3+idx];
1721: for (i=1; i<n; i++) {
1722: v = aa + 16*ai[i];
1723: vi = aj + ai[i];
1724: nz = diag[i] - ai[i];
1725: idx = 4*(*r++);
1726: s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
1727: while (nz--) {
1728: idx = 4*(*vi++);
1729: x1 = t[idx];x2 = t[1+idx];x3 = t[2+idx];x4 = t[3+idx];
1730: s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4;
1731: s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4;
1732: s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
1733: s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
1734: v += 16;
1735: }
1736: idx = 4*i;
1737: t[idx] = s1;t[1+idx] = s2;
1738: t[2+idx] = s3;t[3+idx] = s4;
1739: }
1740: /* backward solve the upper triangular */
1741: for (i=n-1; i>=0; i--){
1742: v = aa + 16*diag[i] + 16;
1743: vi = aj + diag[i] + 1;
1744: nz = ai[i+1] - diag[i] - 1;
1745: idt = 4*i;
1746: s1 = t[idt]; s2 = t[1+idt];
1747: s3 = t[2+idt];s4 = t[3+idt];
1748: while (nz--) {
1749: idx = 4*(*vi++);
1750: x1 = t[idx]; x2 = t[1+idx];
1751: x3 = t[2+idx]; x4 = t[3+idx];
1752: s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4;
1753: s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4;
1754: s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
1755: s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
1756: v += 16;
1757: }
1758: idc = 4*(*c--);
1759: v = aa + 16*diag[i];
1760: x[idc] = t[idt] = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4;
1761: x[1+idc] = t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4;
1762: x[2+idc] = t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4;
1763: x[3+idc] = t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4;
1764: }
1766: ISRestoreIndices(isrow,&rout);
1767: ISRestoreIndices(iscol,&cout);
1768: VecRestoreArray(bb,&b);
1769: VecRestoreArray(xx,&x);
1770: PetscLogFlops(2*16*(a->nz) - 4*A->cmap.n);
1771: return(0);
1772: }
1776: PetscErrorCode MatSolve_SeqBAIJ_4_Demotion(Mat A,Vec bb,Vec xx)
1777: {
1778: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
1779: IS iscol=a->col,isrow=a->row;
1781: PetscInt *r,*c,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout;
1782: PetscInt *diag = a->diag;
1783: MatScalar *aa=a->a,*v,s1,s2,s3,s4,x1,x2,x3,x4,*t;
1784: PetscScalar *x,*b;
1787: VecGetArray(bb,&b);
1788: VecGetArray(xx,&x);
1789: t = (MatScalar *)a->solve_work;
1791: ISGetIndices(isrow,&rout); r = rout;
1792: ISGetIndices(iscol,&cout); c = cout + (n-1);
1794: /* forward solve the lower triangular */
1795: idx = 4*(*r++);
1796: t[0] = (MatScalar)b[idx];
1797: t[1] = (MatScalar)b[1+idx];
1798: t[2] = (MatScalar)b[2+idx];
1799: t[3] = (MatScalar)b[3+idx];
1800: for (i=1; i<n; i++) {
1801: v = aa + 16*ai[i];
1802: vi = aj + ai[i];
1803: nz = diag[i] - ai[i];
1804: idx = 4*(*r++);
1805: s1 = (MatScalar)b[idx];
1806: s2 = (MatScalar)b[1+idx];
1807: s3 = (MatScalar)b[2+idx];
1808: s4 = (MatScalar)b[3+idx];
1809: while (nz--) {
1810: idx = 4*(*vi++);
1811: x1 = t[idx];
1812: x2 = t[1+idx];
1813: x3 = t[2+idx];
1814: x4 = t[3+idx];
1815: s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4;
1816: s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4;
1817: s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
1818: s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
1819: v += 16;
1820: }
1821: idx = 4*i;
1822: t[idx] = s1;
1823: t[1+idx] = s2;
1824: t[2+idx] = s3;
1825: t[3+idx] = s4;
1826: }
1827: /* backward solve the upper triangular */
1828: for (i=n-1; i>=0; i--){
1829: v = aa + 16*diag[i] + 16;
1830: vi = aj + diag[i] + 1;
1831: nz = ai[i+1] - diag[i] - 1;
1832: idt = 4*i;
1833: s1 = t[idt];
1834: s2 = t[1+idt];
1835: s3 = t[2+idt];
1836: s4 = t[3+idt];
1837: while (nz--) {
1838: idx = 4*(*vi++);
1839: x1 = t[idx];
1840: x2 = t[1+idx];
1841: x3 = t[2+idx];
1842: x4 = t[3+idx];
1843: s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4;
1844: s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4;
1845: s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
1846: s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
1847: v += 16;
1848: }
1849: idc = 4*(*c--);
1850: v = aa + 16*diag[i];
1851: t[idt] = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4;
1852: t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4;
1853: t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4;
1854: t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4;
1855: x[idc] = (PetscScalar)t[idt];
1856: x[1+idc] = (PetscScalar)t[1+idt];
1857: x[2+idc] = (PetscScalar)t[2+idt];
1858: x[3+idc] = (PetscScalar)t[3+idt];
1859: }
1861: ISRestoreIndices(isrow,&rout);
1862: ISRestoreIndices(iscol,&cout);
1863: VecRestoreArray(bb,&b);
1864: VecRestoreArray(xx,&x);
1865: PetscLogFlops(2*16*(a->nz) - 4*A->cmap.n);
1866: return(0);
1867: }
1869: #if defined (PETSC_HAVE_SSE)
1871: #include PETSC_HAVE_SSE
1875: PetscErrorCode MatSolve_SeqBAIJ_4_SSE_Demotion(Mat A,Vec bb,Vec xx)
1876: {
1877: /*
1878: Note: This code uses demotion of double
1879: to float when performing the mixed-mode computation.
1880: This may not be numerically reasonable for all applications.
1881: */
1882: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
1883: IS iscol=a->col,isrow=a->row;
1885: PetscInt *r,*c,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout;
1886: PetscInt *diag = a->diag,ai16;
1887: MatScalar *aa=a->a,*v;
1888: PetscScalar *x,*b,*t;
1890: /* Make space in temp stack for 16 Byte Aligned arrays */
1891: float ssealignedspace[11],*tmps,*tmpx;
1892: unsigned long offset;
1893:
1895: SSE_SCOPE_BEGIN;
1897: offset = (unsigned long)ssealignedspace % 16;
1898: if (offset) offset = (16 - offset)/4;
1899: tmps = &ssealignedspace[offset];
1900: tmpx = &ssealignedspace[offset+4];
1901: PREFETCH_NTA(aa+16*ai[1]);
1903: VecGetArray(bb,&b);
1904: VecGetArray(xx,&x);
1905: t = a->solve_work;
1907: ISGetIndices(isrow,&rout); r = rout;
1908: ISGetIndices(iscol,&cout); c = cout + (n-1);
1910: /* forward solve the lower triangular */
1911: idx = 4*(*r++);
1912: t[0] = b[idx]; t[1] = b[1+idx];
1913: t[2] = b[2+idx]; t[3] = b[3+idx];
1914: v = aa + 16*ai[1];
1916: for (i=1; i<n;) {
1917: PREFETCH_NTA(&v[8]);
1918: vi = aj + ai[i];
1919: nz = diag[i] - ai[i];
1920: idx = 4*(*r++);
1922: /* Demote sum from double to float */
1923: CONVERT_DOUBLE4_FLOAT4(tmps,&b[idx]);
1924: LOAD_PS(tmps,XMM7);
1926: while (nz--) {
1927: PREFETCH_NTA(&v[16]);
1928: idx = 4*(*vi++);
1929:
1930: /* Demote solution (so far) from double to float */
1931: CONVERT_DOUBLE4_FLOAT4(tmpx,&x[idx]);
1933: /* 4x4 Matrix-Vector product with negative accumulation: */
1934: SSE_INLINE_BEGIN_2(tmpx,v)
1935: SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
1937: /* First Column */
1938: SSE_COPY_PS(XMM0,XMM6)
1939: SSE_SHUFFLE(XMM0,XMM0,0x00)
1940: SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
1941: SSE_SUB_PS(XMM7,XMM0)
1942:
1943: /* Second Column */
1944: SSE_COPY_PS(XMM1,XMM6)
1945: SSE_SHUFFLE(XMM1,XMM1,0x55)
1946: SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
1947: SSE_SUB_PS(XMM7,XMM1)
1948:
1949: SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
1950:
1951: /* Third Column */
1952: SSE_COPY_PS(XMM2,XMM6)
1953: SSE_SHUFFLE(XMM2,XMM2,0xAA)
1954: SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
1955: SSE_SUB_PS(XMM7,XMM2)
1957: /* Fourth Column */
1958: SSE_COPY_PS(XMM3,XMM6)
1959: SSE_SHUFFLE(XMM3,XMM3,0xFF)
1960: SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
1961: SSE_SUB_PS(XMM7,XMM3)
1962: SSE_INLINE_END_2
1963:
1964: v += 16;
1965: }
1966: idx = 4*i;
1967: v = aa + 16*ai[++i];
1968: PREFETCH_NTA(v);
1969: STORE_PS(tmps,XMM7);
1971: /* Promote result from float to double */
1972: CONVERT_FLOAT4_DOUBLE4(&t[idx],tmps);
1973: }
1974: /* backward solve the upper triangular */
1975: idt = 4*(n-1);
1976: ai16 = 16*diag[n-1];
1977: v = aa + ai16 + 16;
1978: for (i=n-1; i>=0;){
1979: PREFETCH_NTA(&v[8]);
1980: vi = aj + diag[i] + 1;
1981: nz = ai[i+1] - diag[i] - 1;
1982:
1983: /* Demote accumulator from double to float */
1984: CONVERT_DOUBLE4_FLOAT4(tmps,&t[idt]);
1985: LOAD_PS(tmps,XMM7);
1987: while (nz--) {
1988: PREFETCH_NTA(&v[16]);
1989: idx = 4*(*vi++);
1991: /* Demote solution (so far) from double to float */
1992: CONVERT_DOUBLE4_FLOAT4(tmpx,&t[idx]);
1994: /* 4x4 Matrix-Vector Product with negative accumulation: */
1995: SSE_INLINE_BEGIN_2(tmpx,v)
1996: SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
1998: /* First Column */
1999: SSE_COPY_PS(XMM0,XMM6)
2000: SSE_SHUFFLE(XMM0,XMM0,0x00)
2001: SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
2002: SSE_SUB_PS(XMM7,XMM0)
2004: /* Second Column */
2005: SSE_COPY_PS(XMM1,XMM6)
2006: SSE_SHUFFLE(XMM1,XMM1,0x55)
2007: SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
2008: SSE_SUB_PS(XMM7,XMM1)
2010: SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
2011:
2012: /* Third Column */
2013: SSE_COPY_PS(XMM2,XMM6)
2014: SSE_SHUFFLE(XMM2,XMM2,0xAA)
2015: SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
2016: SSE_SUB_PS(XMM7,XMM2)
2018: /* Fourth Column */
2019: SSE_COPY_PS(XMM3,XMM6)
2020: SSE_SHUFFLE(XMM3,XMM3,0xFF)
2021: SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
2022: SSE_SUB_PS(XMM7,XMM3)
2023: SSE_INLINE_END_2
2024: v += 16;
2025: }
2026: v = aa + ai16;
2027: ai16 = 16*diag[--i];
2028: PREFETCH_NTA(aa+ai16+16);
2029: /*
2030: Scale the result by the diagonal 4x4 block,
2031: which was inverted as part of the factorization
2032: */
2033: SSE_INLINE_BEGIN_3(v,tmps,aa+ai16)
2034: /* First Column */
2035: SSE_COPY_PS(XMM0,XMM7)
2036: SSE_SHUFFLE(XMM0,XMM0,0x00)
2037: SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0)
2039: /* Second Column */
2040: SSE_COPY_PS(XMM1,XMM7)
2041: SSE_SHUFFLE(XMM1,XMM1,0x55)
2042: SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4)
2043: SSE_ADD_PS(XMM0,XMM1)
2045: SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24)
2046:
2047: /* Third Column */
2048: SSE_COPY_PS(XMM2,XMM7)
2049: SSE_SHUFFLE(XMM2,XMM2,0xAA)
2050: SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8)
2051: SSE_ADD_PS(XMM0,XMM2)
2053: /* Fourth Column */
2054: SSE_COPY_PS(XMM3,XMM7)
2055: SSE_SHUFFLE(XMM3,XMM3,0xFF)
2056: SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12)
2057: SSE_ADD_PS(XMM0,XMM3)
2058:
2059: SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0)
2060: SSE_INLINE_END_3
2062: /* Promote solution from float to double */
2063: CONVERT_FLOAT4_DOUBLE4(&t[idt],tmps);
2065: /* Apply reordering to t and stream into x. */
2066: /* This way, x doesn't pollute the cache. */
2067: /* Be careful with size: 2 doubles = 4 floats! */
2068: idc = 4*(*c--);
2069: SSE_INLINE_BEGIN_2((float *)&t[idt],(float *)&x[idc])
2070: /* x[idc] = t[idt]; x[1+idc] = t[1+idc]; */
2071: SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM0)
2072: SSE_STREAM_PS(SSE_ARG_2,FLOAT_0,XMM0)
2073: /* x[idc+2] = t[idt+2]; x[3+idc] = t[3+idc]; */
2074: SSE_LOAD_PS(SSE_ARG_1,FLOAT_4,XMM1)
2075: SSE_STREAM_PS(SSE_ARG_2,FLOAT_4,XMM1)
2076: SSE_INLINE_END_2
2077: v = aa + ai16 + 16;
2078: idt -= 4;
2079: }
2081: ISRestoreIndices(isrow,&rout);
2082: ISRestoreIndices(iscol,&cout);
2083: VecRestoreArray(bb,&b);
2084: VecRestoreArray(xx,&x);
2085: PetscLogFlops(2*16*(a->nz) - 4*A->cmap.n);
2086: SSE_SCOPE_END;
2087: return(0);
2088: }
2090: #endif
2093: /*
2094: Special case where the matrix was ILU(0) factored in the natural
2095: ordering. This eliminates the need for the column and row permutation.
2096: */
2099: PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering(Mat A,Vec bb,Vec xx)
2100: {
2101: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
2102: PetscInt n=a->mbs,*ai=a->i,*aj=a->j;
2104: PetscInt *diag = a->diag;
2105: MatScalar *aa=a->a;
2106: PetscScalar *x,*b;
2109: VecGetArray(bb,&b);
2110: VecGetArray(xx,&x);
2112: #if defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJBLAS)
2113: {
2114: static PetscScalar w[2000]; /* very BAD need to fix */
2115: fortransolvebaij4blas_(&n,x,ai,aj,diag,aa,b,w);
2116: }
2117: #elif defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJ)
2118: {
2119: static PetscScalar w[2000]; /* very BAD need to fix */
2120: fortransolvebaij4_(&n,x,ai,aj,diag,aa,b,w);
2121: }
2122: #elif defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJUNROLL)
2123: fortransolvebaij4unroll_(&n,x,ai,aj,diag,aa,b);
2124: #else
2125: {
2126: PetscScalar s1,s2,s3,s4,x1,x2,x3,x4;
2127: MatScalar *v;
2128: PetscInt jdx,idt,idx,nz,*vi,i,ai16;
2130: /* forward solve the lower triangular */
2131: idx = 0;
2132: x[0] = b[0]; x[1] = b[1]; x[2] = b[2]; x[3] = b[3];
2133: for (i=1; i<n; i++) {
2134: v = aa + 16*ai[i];
2135: vi = aj + ai[i];
2136: nz = diag[i] - ai[i];
2137: idx += 4;
2138: s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
2139: while (nz--) {
2140: jdx = 4*(*vi++);
2141: x1 = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx];
2142: s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4;
2143: s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4;
2144: s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
2145: s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
2146: v += 16;
2147: }
2148: x[idx] = s1;
2149: x[1+idx] = s2;
2150: x[2+idx] = s3;
2151: x[3+idx] = s4;
2152: }
2153: /* backward solve the upper triangular */
2154: idt = 4*(n-1);
2155: for (i=n-1; i>=0; i--){
2156: ai16 = 16*diag[i];
2157: v = aa + ai16 + 16;
2158: vi = aj + diag[i] + 1;
2159: nz = ai[i+1] - diag[i] - 1;
2160: s1 = x[idt]; s2 = x[1+idt];
2161: s3 = x[2+idt];s4 = x[3+idt];
2162: while (nz--) {
2163: idx = 4*(*vi++);
2164: x1 = x[idx]; x2 = x[1+idx];x3 = x[2+idx]; x4 = x[3+idx];
2165: s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4;
2166: s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4;
2167: s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
2168: s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
2169: v += 16;
2170: }
2171: v = aa + ai16;
2172: x[idt] = v[0]*s1 + v[4]*s2 + v[8]*s3 + v[12]*s4;
2173: x[1+idt] = v[1]*s1 + v[5]*s2 + v[9]*s3 + v[13]*s4;
2174: x[2+idt] = v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4;
2175: x[3+idt] = v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4;
2176: idt -= 4;
2177: }
2178: }
2179: #endif
2181: VecRestoreArray(bb,&b);
2182: VecRestoreArray(xx,&x);
2183: PetscLogFlops(2*16*(a->nz) - 4*A->cmap.n);
2184: return(0);
2185: }
2189: PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_Demotion(Mat A,Vec bb,Vec xx)
2190: {
2191: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
2192: PetscInt n=a->mbs,*ai=a->i,*aj=a->j;
2194: PetscInt *diag = a->diag;
2195: MatScalar *aa=a->a;
2196: PetscScalar *x,*b;
2199: VecGetArray(bb,&b);
2200: VecGetArray(xx,&x);
2202: {
2203: MatScalar s1,s2,s3,s4,x1,x2,x3,x4;
2204: MatScalar *v,*t=(MatScalar *)x;
2205: PetscInt jdx,idt,idx,nz,*vi,i,ai16;
2207: /* forward solve the lower triangular */
2208: idx = 0;
2209: t[0] = (MatScalar)b[0];
2210: t[1] = (MatScalar)b[1];
2211: t[2] = (MatScalar)b[2];
2212: t[3] = (MatScalar)b[3];
2213: for (i=1; i<n; i++) {
2214: v = aa + 16*ai[i];
2215: vi = aj + ai[i];
2216: nz = diag[i] - ai[i];
2217: idx += 4;
2218: s1 = (MatScalar)b[idx];
2219: s2 = (MatScalar)b[1+idx];
2220: s3 = (MatScalar)b[2+idx];
2221: s4 = (MatScalar)b[3+idx];
2222: while (nz--) {
2223: jdx = 4*(*vi++);
2224: x1 = t[jdx];
2225: x2 = t[1+jdx];
2226: x3 = t[2+jdx];
2227: x4 = t[3+jdx];
2228: s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4;
2229: s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4;
2230: s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
2231: s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
2232: v += 16;
2233: }
2234: t[idx] = s1;
2235: t[1+idx] = s2;
2236: t[2+idx] = s3;
2237: t[3+idx] = s4;
2238: }
2239: /* backward solve the upper triangular */
2240: idt = 4*(n-1);
2241: for (i=n-1; i>=0; i--){
2242: ai16 = 16*diag[i];
2243: v = aa + ai16 + 16;
2244: vi = aj + diag[i] + 1;
2245: nz = ai[i+1] - diag[i] - 1;
2246: s1 = t[idt];
2247: s2 = t[1+idt];
2248: s3 = t[2+idt];
2249: s4 = t[3+idt];
2250: while (nz--) {
2251: idx = 4*(*vi++);
2252: x1 = (MatScalar)x[idx];
2253: x2 = (MatScalar)x[1+idx];
2254: x3 = (MatScalar)x[2+idx];
2255: x4 = (MatScalar)x[3+idx];
2256: s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4;
2257: s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4;
2258: s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
2259: s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
2260: v += 16;
2261: }
2262: v = aa + ai16;
2263: x[idt] = (PetscScalar)(v[0]*s1 + v[4]*s2 + v[8]*s3 + v[12]*s4);
2264: x[1+idt] = (PetscScalar)(v[1]*s1 + v[5]*s2 + v[9]*s3 + v[13]*s4);
2265: x[2+idt] = (PetscScalar)(v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4);
2266: x[3+idt] = (PetscScalar)(v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4);
2267: idt -= 4;
2268: }
2269: }
2271: VecRestoreArray(bb,&b);
2272: VecRestoreArray(xx,&x);
2273: PetscLogFlops(2*16*(a->nz) - 4*A->cmap.n);
2274: return(0);
2275: }
2277: #if defined (PETSC_HAVE_SSE)
2279: #include PETSC_HAVE_SSE
2282: PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion_usj(Mat A,Vec bb,Vec xx)
2283: {
2284: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
2285: unsigned short *aj=(unsigned short *)a->j;
2287: int *ai=a->i,n=a->mbs,*diag = a->diag;
2288: MatScalar *aa=a->a;
2289: PetscScalar *x,*b;
2292: SSE_SCOPE_BEGIN;
2293: /*
2294: Note: This code currently uses demotion of double
2295: to float when performing the mixed-mode computation.
2296: This may not be numerically reasonable for all applications.
2297: */
2298: PREFETCH_NTA(aa+16*ai[1]);
2300: VecGetArray(bb,&b);
2301: VecGetArray(xx,&x);
2302: {
2303: /* x will first be computed in single precision then promoted inplace to double */
2304: MatScalar *v,*t=(MatScalar *)x;
2305: int nz,i,idt,ai16;
2306: unsigned int jdx,idx;
2307: unsigned short *vi;
2308: /* Forward solve the lower triangular factor. */
2310: /* First block is the identity. */
2311: idx = 0;
2312: CONVERT_DOUBLE4_FLOAT4(t,b);
2313: v = aa + 16*((unsigned int)ai[1]);
2315: for (i=1; i<n;) {
2316: PREFETCH_NTA(&v[8]);
2317: vi = aj + ai[i];
2318: nz = diag[i] - ai[i];
2319: idx += 4;
2321: /* Demote RHS from double to float. */
2322: CONVERT_DOUBLE4_FLOAT4(&t[idx],&b[idx]);
2323: LOAD_PS(&t[idx],XMM7);
2325: while (nz--) {
2326: PREFETCH_NTA(&v[16]);
2327: jdx = 4*((unsigned int)(*vi++));
2328:
2329: /* 4x4 Matrix-Vector product with negative accumulation: */
2330: SSE_INLINE_BEGIN_2(&t[jdx],v)
2331: SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
2333: /* First Column */
2334: SSE_COPY_PS(XMM0,XMM6)
2335: SSE_SHUFFLE(XMM0,XMM0,0x00)
2336: SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
2337: SSE_SUB_PS(XMM7,XMM0)
2339: /* Second Column */
2340: SSE_COPY_PS(XMM1,XMM6)
2341: SSE_SHUFFLE(XMM1,XMM1,0x55)
2342: SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
2343: SSE_SUB_PS(XMM7,XMM1)
2345: SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
2346:
2347: /* Third Column */
2348: SSE_COPY_PS(XMM2,XMM6)
2349: SSE_SHUFFLE(XMM2,XMM2,0xAA)
2350: SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
2351: SSE_SUB_PS(XMM7,XMM2)
2353: /* Fourth Column */
2354: SSE_COPY_PS(XMM3,XMM6)
2355: SSE_SHUFFLE(XMM3,XMM3,0xFF)
2356: SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
2357: SSE_SUB_PS(XMM7,XMM3)
2358: SSE_INLINE_END_2
2359:
2360: v += 16;
2361: }
2362: v = aa + 16*ai[++i];
2363: PREFETCH_NTA(v);
2364: STORE_PS(&t[idx],XMM7);
2365: }
2367: /* Backward solve the upper triangular factor.*/
2369: idt = 4*(n-1);
2370: ai16 = 16*diag[n-1];
2371: v = aa + ai16 + 16;
2372: for (i=n-1; i>=0;){
2373: PREFETCH_NTA(&v[8]);
2374: vi = aj + diag[i] + 1;
2375: nz = ai[i+1] - diag[i] - 1;
2376:
2377: LOAD_PS(&t[idt],XMM7);
2379: while (nz--) {
2380: PREFETCH_NTA(&v[16]);
2381: idx = 4*((unsigned int)(*vi++));
2383: /* 4x4 Matrix-Vector Product with negative accumulation: */
2384: SSE_INLINE_BEGIN_2(&t[idx],v)
2385: SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
2387: /* First Column */
2388: SSE_COPY_PS(XMM0,XMM6)
2389: SSE_SHUFFLE(XMM0,XMM0,0x00)
2390: SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
2391: SSE_SUB_PS(XMM7,XMM0)
2393: /* Second Column */
2394: SSE_COPY_PS(XMM1,XMM6)
2395: SSE_SHUFFLE(XMM1,XMM1,0x55)
2396: SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
2397: SSE_SUB_PS(XMM7,XMM1)
2399: SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
2400:
2401: /* Third Column */
2402: SSE_COPY_PS(XMM2,XMM6)
2403: SSE_SHUFFLE(XMM2,XMM2,0xAA)
2404: SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
2405: SSE_SUB_PS(XMM7,XMM2)
2407: /* Fourth Column */
2408: SSE_COPY_PS(XMM3,XMM6)
2409: SSE_SHUFFLE(XMM3,XMM3,0xFF)
2410: SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
2411: SSE_SUB_PS(XMM7,XMM3)
2412: SSE_INLINE_END_2
2413: v += 16;
2414: }
2415: v = aa + ai16;
2416: ai16 = 16*diag[--i];
2417: PREFETCH_NTA(aa+ai16+16);
2418: /*
2419: Scale the result by the diagonal 4x4 block,
2420: which was inverted as part of the factorization
2421: */
2422: SSE_INLINE_BEGIN_3(v,&t[idt],aa+ai16)
2423: /* First Column */
2424: SSE_COPY_PS(XMM0,XMM7)
2425: SSE_SHUFFLE(XMM0,XMM0,0x00)
2426: SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0)
2428: /* Second Column */
2429: SSE_COPY_PS(XMM1,XMM7)
2430: SSE_SHUFFLE(XMM1,XMM1,0x55)
2431: SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4)
2432: SSE_ADD_PS(XMM0,XMM1)
2434: SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24)
2435:
2436: /* Third Column */
2437: SSE_COPY_PS(XMM2,XMM7)
2438: SSE_SHUFFLE(XMM2,XMM2,0xAA)
2439: SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8)
2440: SSE_ADD_PS(XMM0,XMM2)
2442: /* Fourth Column */
2443: SSE_COPY_PS(XMM3,XMM7)
2444: SSE_SHUFFLE(XMM3,XMM3,0xFF)
2445: SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12)
2446: SSE_ADD_PS(XMM0,XMM3)
2448: SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0)
2449: SSE_INLINE_END_3
2451: v = aa + ai16 + 16;
2452: idt -= 4;
2453: }
2455: /* Convert t from single precision back to double precision (inplace)*/
2456: idt = 4*(n-1);
2457: for (i=n-1;i>=0;i--) {
2458: /* CONVERT_FLOAT4_DOUBLE4(&x[idt],&t[idt]); */
2459: /* Unfortunately, CONVERT_ will count from 0 to 3 which doesn't work here. */
2460: PetscScalar *xtemp=&x[idt];
2461: MatScalar *ttemp=&t[idt];
2462: xtemp[3] = (PetscScalar)ttemp[3];
2463: xtemp[2] = (PetscScalar)ttemp[2];
2464: xtemp[1] = (PetscScalar)ttemp[1];
2465: xtemp[0] = (PetscScalar)ttemp[0];
2466: idt -= 4;
2467: }
2469: } /* End of artificial scope. */
2470: VecRestoreArray(bb,&b);
2471: VecRestoreArray(xx,&x);
2472: PetscLogFlops(2*16*(a->nz) - 4*A->cmap.n);
2473: SSE_SCOPE_END;
2474: return(0);
2475: }
2479: PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion(Mat A,Vec bb,Vec xx)
2480: {
2481: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
2482: int *aj=a->j;
2484: int *ai=a->i,n=a->mbs,*diag = a->diag;
2485: MatScalar *aa=a->a;
2486: PetscScalar *x,*b;
2489: SSE_SCOPE_BEGIN;
2490: /*
2491: Note: This code currently uses demotion of double
2492: to float when performing the mixed-mode computation.
2493: This may not be numerically reasonable for all applications.
2494: */
2495: PREFETCH_NTA(aa+16*ai[1]);
2497: VecGetArray(bb,&b);
2498: VecGetArray(xx,&x);
2499: {
2500: /* x will first be computed in single precision then promoted inplace to double */
2501: MatScalar *v,*t=(MatScalar *)x;
2502: int nz,i,idt,ai16;
2503: int jdx,idx;
2504: int *vi;
2505: /* Forward solve the lower triangular factor. */
2507: /* First block is the identity. */
2508: idx = 0;
2509: CONVERT_DOUBLE4_FLOAT4(t,b);
2510: v = aa + 16*ai[1];
2512: for (i=1; i<n;) {
2513: PREFETCH_NTA(&v[8]);
2514: vi = aj + ai[i];
2515: nz = diag[i] - ai[i];
2516: idx += 4;
2518: /* Demote RHS from double to float. */
2519: CONVERT_DOUBLE4_FLOAT4(&t[idx],&b[idx]);
2520: LOAD_PS(&t[idx],XMM7);
2522: while (nz--) {
2523: PREFETCH_NTA(&v[16]);
2524: jdx = 4*(*vi++);
2525: /* jdx = *vi++; */
2526:
2527: /* 4x4 Matrix-Vector product with negative accumulation: */
2528: SSE_INLINE_BEGIN_2(&t[jdx],v)
2529: SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
2531: /* First Column */
2532: SSE_COPY_PS(XMM0,XMM6)
2533: SSE_SHUFFLE(XMM0,XMM0,0x00)
2534: SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
2535: SSE_SUB_PS(XMM7,XMM0)
2537: /* Second Column */
2538: SSE_COPY_PS(XMM1,XMM6)
2539: SSE_SHUFFLE(XMM1,XMM1,0x55)
2540: SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
2541: SSE_SUB_PS(XMM7,XMM1)
2543: SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
2544:
2545: /* Third Column */
2546: SSE_COPY_PS(XMM2,XMM6)
2547: SSE_SHUFFLE(XMM2,XMM2,0xAA)
2548: SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
2549: SSE_SUB_PS(XMM7,XMM2)
2551: /* Fourth Column */
2552: SSE_COPY_PS(XMM3,XMM6)
2553: SSE_SHUFFLE(XMM3,XMM3,0xFF)
2554: SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
2555: SSE_SUB_PS(XMM7,XMM3)
2556: SSE_INLINE_END_2
2557:
2558: v += 16;
2559: }
2560: v = aa + 16*ai[++i];
2561: PREFETCH_NTA(v);
2562: STORE_PS(&t[idx],XMM7);
2563: }
2565: /* Backward solve the upper triangular factor.*/
2567: idt = 4*(n-1);
2568: ai16 = 16*diag[n-1];
2569: v = aa + ai16 + 16;
2570: for (i=n-1; i>=0;){
2571: PREFETCH_NTA(&v[8]);
2572: vi = aj + diag[i] + 1;
2573: nz = ai[i+1] - diag[i] - 1;
2574:
2575: LOAD_PS(&t[idt],XMM7);
2577: while (nz--) {
2578: PREFETCH_NTA(&v[16]);
2579: idx = 4*(*vi++);
2580: /* idx = *vi++; */
2582: /* 4x4 Matrix-Vector Product with negative accumulation: */
2583: SSE_INLINE_BEGIN_2(&t[idx],v)
2584: SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
2586: /* First Column */
2587: SSE_COPY_PS(XMM0,XMM6)
2588: SSE_SHUFFLE(XMM0,XMM0,0x00)
2589: SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
2590: SSE_SUB_PS(XMM7,XMM0)
2592: /* Second Column */
2593: SSE_COPY_PS(XMM1,XMM6)
2594: SSE_SHUFFLE(XMM1,XMM1,0x55)
2595: SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
2596: SSE_SUB_PS(XMM7,XMM1)
2598: SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
2599:
2600: /* Third Column */
2601: SSE_COPY_PS(XMM2,XMM6)
2602: SSE_SHUFFLE(XMM2,XMM2,0xAA)
2603: SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
2604: SSE_SUB_PS(XMM7,XMM2)
2606: /* Fourth Column */
2607: SSE_COPY_PS(XMM3,XMM6)
2608: SSE_SHUFFLE(XMM3,XMM3,0xFF)
2609: SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
2610: SSE_SUB_PS(XMM7,XMM3)
2611: SSE_INLINE_END_2
2612: v += 16;
2613: }
2614: v = aa + ai16;
2615: ai16 = 16*diag[--i];
2616: PREFETCH_NTA(aa+ai16+16);
2617: /*
2618: Scale the result by the diagonal 4x4 block,
2619: which was inverted as part of the factorization
2620: */
2621: SSE_INLINE_BEGIN_3(v,&t[idt],aa+ai16)
2622: /* First Column */
2623: SSE_COPY_PS(XMM0,XMM7)
2624: SSE_SHUFFLE(XMM0,XMM0,0x00)
2625: SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0)
2627: /* Second Column */
2628: SSE_COPY_PS(XMM1,XMM7)
2629: SSE_SHUFFLE(XMM1,XMM1,0x55)
2630: SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4)
2631: SSE_ADD_PS(XMM0,XMM1)
2633: SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24)
2634:
2635: /* Third Column */
2636: SSE_COPY_PS(XMM2,XMM7)
2637: SSE_SHUFFLE(XMM2,XMM2,0xAA)
2638: SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8)
2639: SSE_ADD_PS(XMM0,XMM2)
2641: /* Fourth Column */
2642: SSE_COPY_PS(XMM3,XMM7)
2643: SSE_SHUFFLE(XMM3,XMM3,0xFF)
2644: SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12)
2645: SSE_ADD_PS(XMM0,XMM3)
2647: SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0)
2648: SSE_INLINE_END_3
2650: v = aa + ai16 + 16;
2651: idt -= 4;
2652: }
2654: /* Convert t from single precision back to double precision (inplace)*/
2655: idt = 4*(n-1);
2656: for (i=n-1;i>=0;i--) {
2657: /* CONVERT_FLOAT4_DOUBLE4(&x[idt],&t[idt]); */
2658: /* Unfortunately, CONVERT_ will count from 0 to 3 which doesn't work here. */
2659: PetscScalar *xtemp=&x[idt];
2660: MatScalar *ttemp=&t[idt];
2661: xtemp[3] = (PetscScalar)ttemp[3];
2662: xtemp[2] = (PetscScalar)ttemp[2];
2663: xtemp[1] = (PetscScalar)ttemp[1];
2664: xtemp[0] = (PetscScalar)ttemp[0];
2665: idt -= 4;
2666: }
2668: } /* End of artificial scope. */
2669: VecRestoreArray(bb,&b);
2670: VecRestoreArray(xx,&x);
2671: PetscLogFlops(2*16*(a->nz) - 4*A->cmap.n);
2672: SSE_SCOPE_END;
2673: return(0);
2674: }
2676: #endif
2679: PetscErrorCode MatSolve_SeqBAIJ_3(Mat A,Vec bb,Vec xx)
2680: {
2681: Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data;
2682: IS iscol=a->col,isrow=a->row;
2684: PetscInt *r,*c,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout;
2685: PetscInt *diag = a->diag;
2686: MatScalar *aa=a->a,*v;
2687: PetscScalar *x,*b,s1,s2,s3,x1,x2,x3,*t;
2690: VecGetArray(bb,&b);
2691: VecGetArray(xx,&x);
2692: t = a->solve_work;
2694: ISGetIndices(isrow,&rout); r = rout;
2695: ISGetIndices(iscol,&cout); c = cout + (n-1);
2697: /* forward solve the lower triangular */
2698: idx = 3*(*r++);
2699: t[0] = b[idx]; t[1] = b[1+idx]; t[2] = b[2+idx];
2700: for (i=1; i<n; i++) {
2701: v = aa + 9*ai[i];
2702: vi = aj + ai[i];
2703: nz = diag[i] - ai[i];
2704: idx = 3*(*r++);
2705: s1 = b[idx]; s2 = b[1+idx]; s3 = b[2+idx];
2706: while (nz--) {
2707: idx = 3*(*vi++);
2708: x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx];
2709: s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
2710: s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
2711: s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
2712: v += 9;
2713: }
2714: idx = 3*i;
2715: t[idx] = s1; t[1+idx] = s2; t[2+idx] = s3;
2716: }
2717: /* backward solve the upper triangular */
2718: for (i=n-1; i>=0; i--){
2719: v = aa + 9*diag[i] + 9;
2720: vi = aj + diag[i] + 1;
2721: nz = ai[i+1] - diag[i] - 1;
2722: idt = 3*i;
2723: s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];
2724: while (nz--) {
2725: idx = 3*(*vi++);
2726: x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx];
2727: s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
2728: s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
2729: s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
2730: v += 9;
2731: }
2732: idc = 3*(*c--);
2733: v = aa + 9*diag[i];
2734: x[idc] = t[idt] = v[0]*s1 + v[3]*s2 + v[6]*s3;
2735: x[1+idc] = t[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3;
2736: x[2+idc] = t[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3;
2737: }
2738: ISRestoreIndices(isrow,&rout);
2739: ISRestoreIndices(iscol,&cout);
2740: VecRestoreArray(bb,&b);
2741: VecRestoreArray(xx,&x);
2742: PetscLogFlops(2*9*(a->nz) - 3*A->cmap.n);
2743: return(0);
2744: }
2746: /*
2747: Special case where the matrix was ILU(0) factored in the natural
2748: ordering. This eliminates the need for the column and row permutation.
2749: */
2752: PetscErrorCode MatSolve_SeqBAIJ_3_NaturalOrdering(Mat A,Vec bb,Vec xx)
2753: {
2754: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
2755: PetscInt n=a->mbs,*ai=a->i,*aj=a->j;
2757: PetscInt *diag = a->diag;
2758: MatScalar *aa=a->a,*v;
2759: PetscScalar *x,*b,s1,s2,s3,x1,x2,x3;
2760: PetscInt jdx,idt,idx,nz,*vi,i;
2763: VecGetArray(bb,&b);
2764: VecGetArray(xx,&x);
2767: /* forward solve the lower triangular */
2768: idx = 0;
2769: x[0] = b[0]; x[1] = b[1]; x[2] = b[2];
2770: for (i=1; i<n; i++) {
2771: v = aa + 9*ai[i];
2772: vi = aj + ai[i];
2773: nz = diag[i] - ai[i];
2774: idx += 3;
2775: s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];
2776: while (nz--) {
2777: jdx = 3*(*vi++);
2778: x1 = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];
2779: s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
2780: s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
2781: s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
2782: v += 9;
2783: }
2784: x[idx] = s1;
2785: x[1+idx] = s2;
2786: x[2+idx] = s3;
2787: }
2788: /* backward solve the upper triangular */
2789: for (i=n-1; i>=0; i--){
2790: v = aa + 9*diag[i] + 9;
2791: vi = aj + diag[i] + 1;
2792: nz = ai[i+1] - diag[i] - 1;
2793: idt = 3*i;
2794: s1 = x[idt]; s2 = x[1+idt];
2795: s3 = x[2+idt];
2796: while (nz--) {
2797: idx = 3*(*vi++);
2798: x1 = x[idx]; x2 = x[1+idx];x3 = x[2+idx];
2799: s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
2800: s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
2801: s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
2802: v += 9;
2803: }
2804: v = aa + 9*diag[i];
2805: x[idt] = v[0]*s1 + v[3]*s2 + v[6]*s3;
2806: x[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3;
2807: x[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3;
2808: }
2810: VecRestoreArray(bb,&b);
2811: VecRestoreArray(xx,&x);
2812: PetscLogFlops(2*9*(a->nz) - 3*A->cmap.n);
2813: return(0);
2814: }
2818: PetscErrorCode MatSolve_SeqBAIJ_2(Mat A,Vec bb,Vec xx)
2819: {
2820: Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data;
2821: IS iscol=a->col,isrow=a->row;
2823: PetscInt *r,*c,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout;
2824: PetscInt *diag = a->diag;
2825: MatScalar *aa=a->a,*v;
2826: PetscScalar *x,*b,s1,s2,x1,x2,*t;
2829: VecGetArray(bb,&b);
2830: VecGetArray(xx,&x);
2831: t = a->solve_work;
2833: ISGetIndices(isrow,&rout); r = rout;
2834: ISGetIndices(iscol,&cout); c = cout + (n-1);
2836: /* forward solve the lower triangular */
2837: idx = 2*(*r++);
2838: t[0] = b[idx]; t[1] = b[1+idx];
2839: for (i=1; i<n; i++) {
2840: v = aa + 4*ai[i];
2841: vi = aj + ai[i];
2842: nz = diag[i] - ai[i];
2843: idx = 2*(*r++);
2844: s1 = b[idx]; s2 = b[1+idx];
2845: while (nz--) {
2846: idx = 2*(*vi++);
2847: x1 = t[idx]; x2 = t[1+idx];
2848: s1 -= v[0]*x1 + v[2]*x2;
2849: s2 -= v[1]*x1 + v[3]*x2;
2850: v += 4;
2851: }
2852: idx = 2*i;
2853: t[idx] = s1; t[1+idx] = s2;
2854: }
2855: /* backward solve the upper triangular */
2856: for (i=n-1; i>=0; i--){
2857: v = aa + 4*diag[i] + 4;
2858: vi = aj + diag[i] + 1;
2859: nz = ai[i+1] - diag[i] - 1;
2860: idt = 2*i;
2861: s1 = t[idt]; s2 = t[1+idt];
2862: while (nz--) {
2863: idx = 2*(*vi++);
2864: x1 = t[idx]; x2 = t[1+idx];
2865: s1 -= v[0]*x1 + v[2]*x2;
2866: s2 -= v[1]*x1 + v[3]*x2;
2867: v += 4;
2868: }
2869: idc = 2*(*c--);
2870: v = aa + 4*diag[i];
2871: x[idc] = t[idt] = v[0]*s1 + v[2]*s2;
2872: x[1+idc] = t[1+idt] = v[1]*s1 + v[3]*s2;
2873: }
2874: ISRestoreIndices(isrow,&rout);
2875: ISRestoreIndices(iscol,&cout);
2876: VecRestoreArray(bb,&b);
2877: VecRestoreArray(xx,&x);
2878: PetscLogFlops(2*4*(a->nz) - 2*A->cmap.n);
2879: return(0);
2880: }
2882: /*
2883: Special case where the matrix was ILU(0) factored in the natural
2884: ordering. This eliminates the need for the column and row permutation.
2885: */
2888: PetscErrorCode MatSolve_SeqBAIJ_2_NaturalOrdering(Mat A,Vec bb,Vec xx)
2889: {
2890: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
2891: PetscInt n=a->mbs,*ai=a->i,*aj=a->j;
2893: PetscInt *diag = a->diag;
2894: MatScalar *aa=a->a,*v;
2895: PetscScalar *x,*b,s1,s2,x1,x2;
2896: PetscInt jdx,idt,idx,nz,*vi,i;
2899: VecGetArray(bb,&b);
2900: VecGetArray(xx,&x);
2902: /* forward solve the lower triangular */
2903: idx = 0;
2904: x[0] = b[0]; x[1] = b[1];
2905: for (i=1; i<n; i++) {
2906: v = aa + 4*ai[i];
2907: vi = aj + ai[i];
2908: nz = diag[i] - ai[i];
2909: idx += 2;
2910: s1 = b[idx];s2 = b[1+idx];
2911: while (nz--) {
2912: jdx = 2*(*vi++);
2913: x1 = x[jdx];x2 = x[1+jdx];
2914: s1 -= v[0]*x1 + v[2]*x2;
2915: s2 -= v[1]*x1 + v[3]*x2;
2916: v += 4;
2917: }
2918: x[idx] = s1;
2919: x[1+idx] = s2;
2920: }
2921: /* backward solve the upper triangular */
2922: for (i=n-1; i>=0; i--){
2923: v = aa + 4*diag[i] + 4;
2924: vi = aj + diag[i] + 1;
2925: nz = ai[i+1] - diag[i] - 1;
2926: idt = 2*i;
2927: s1 = x[idt]; s2 = x[1+idt];
2928: while (nz--) {
2929: idx = 2*(*vi++);
2930: x1 = x[idx]; x2 = x[1+idx];
2931: s1 -= v[0]*x1 + v[2]*x2;
2932: s2 -= v[1]*x1 + v[3]*x2;
2933: v += 4;
2934: }
2935: v = aa + 4*diag[i];
2936: x[idt] = v[0]*s1 + v[2]*s2;
2937: x[1+idt] = v[1]*s1 + v[3]*s2;
2938: }
2940: VecRestoreArray(bb,&b);
2941: VecRestoreArray(xx,&x);
2942: PetscLogFlops(2*4*(a->nz) - 2*A->cmap.n);
2943: return(0);
2944: }
2948: PetscErrorCode MatSolve_SeqBAIJ_1(Mat A,Vec bb,Vec xx)
2949: {
2950: Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data;
2951: IS iscol=a->col,isrow=a->row;
2953: PetscInt *r,*c,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,*rout,*cout;
2954: PetscInt *diag = a->diag;
2955: MatScalar *aa=a->a,*v;
2956: PetscScalar *x,*b,s1,*t;
2959: if (!n) return(0);
2961: VecGetArray(bb,&b);
2962: VecGetArray(xx,&x);
2963: t = a->solve_work;
2965: ISGetIndices(isrow,&rout); r = rout;
2966: ISGetIndices(iscol,&cout); c = cout + (n-1);
2968: /* forward solve the lower triangular */
2969: t[0] = b[*r++];
2970: for (i=1; i<n; i++) {
2971: v = aa + ai[i];
2972: vi = aj + ai[i];
2973: nz = diag[i] - ai[i];
2974: s1 = b[*r++];
2975: while (nz--) {
2976: s1 -= (*v++)*t[*vi++];
2977: }
2978: t[i] = s1;
2979: }
2980: /* backward solve the upper triangular */
2981: for (i=n-1; i>=0; i--){
2982: v = aa + diag[i] + 1;
2983: vi = aj + diag[i] + 1;
2984: nz = ai[i+1] - diag[i] - 1;
2985: s1 = t[i];
2986: while (nz--) {
2987: s1 -= (*v++)*t[*vi++];
2988: }
2989: x[*c--] = t[i] = aa[diag[i]]*s1;
2990: }
2992: ISRestoreIndices(isrow,&rout);
2993: ISRestoreIndices(iscol,&cout);
2994: VecRestoreArray(bb,&b);
2995: VecRestoreArray(xx,&x);
2996: PetscLogFlops(2*1*(a->nz) - A->cmap.n);
2997: return(0);
2998: }
2999: /*
3000: Special case where the matrix was ILU(0) factored in the natural
3001: ordering. This eliminates the need for the column and row permutation.
3002: */
3005: PetscErrorCode MatSolve_SeqBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx)
3006: {
3007: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
3008: PetscInt n=a->mbs,*ai=a->i,*aj=a->j;
3010: PetscInt *diag = a->diag;
3011: MatScalar *aa=a->a;
3012: PetscScalar *x,*b;
3013: PetscScalar s1,x1;
3014: MatScalar *v;
3015: PetscInt jdx,idt,idx,nz,*vi,i;
3018: VecGetArray(bb,&b);
3019: VecGetArray(xx,&x);
3021: /* forward solve the lower triangular */
3022: idx = 0;
3023: x[0] = b[0];
3024: for (i=1; i<n; i++) {
3025: v = aa + ai[i];
3026: vi = aj + ai[i];
3027: nz = diag[i] - ai[i];
3028: idx += 1;
3029: s1 = b[idx];
3030: while (nz--) {
3031: jdx = *vi++;
3032: x1 = x[jdx];
3033: s1 -= v[0]*x1;
3034: v += 1;
3035: }
3036: x[idx] = s1;
3037: }
3038: /* backward solve the upper triangular */
3039: for (i=n-1; i>=0; i--){
3040: v = aa + diag[i] + 1;
3041: vi = aj + diag[i] + 1;
3042: nz = ai[i+1] - diag[i] - 1;
3043: idt = i;
3044: s1 = x[idt];
3045: while (nz--) {
3046: idx = *vi++;
3047: x1 = x[idx];
3048: s1 -= v[0]*x1;
3049: v += 1;
3050: }
3051: v = aa + diag[i];
3052: x[idt] = v[0]*s1;
3053: }
3054: VecRestoreArray(bb,&b);
3055: VecRestoreArray(xx,&x);
3056: PetscLogFlops(2*(a->nz) - A->cmap.n);
3057: return(0);
3058: }
3060: /* ----------------------------------------------------------------*/
3061: /*
3062: This code is virtually identical to MatILUFactorSymbolic_SeqAIJ
3063: except that the data structure of Mat_SeqAIJ is slightly different.
3064: Not a good example of code reuse.
3065: */
3066: EXTERN PetscErrorCode MatMissingDiagonal_SeqBAIJ(Mat);
3070: PetscErrorCode MatILUFactorSymbolic_SeqBAIJ(Mat A,IS isrow,IS iscol,MatFactorInfo *info,Mat *fact)
3071: {
3072: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b;
3073: IS isicol;
3075: PetscInt *r,*ic,prow,n = a->mbs,*ai = a->i,*aj = a->j;
3076: PetscInt *ainew,*ajnew,jmax,*fill,*xi,nz,*im,*ajfill,*flev;
3077: PetscInt *dloc,idx,row,m,fm,nzf,nzi,reallocate = 0,dcount = 0;
3078: PetscInt incrlev,nnz,i,bs = A->rmap.bs,bs2 = a->bs2,levels,diagonal_fill;
3079: PetscTruth col_identity,row_identity;
3080: PetscReal f;
3083: f = info->fill;
3084: levels = (PetscInt)info->levels;
3085: diagonal_fill = (PetscInt)info->diagonal_fill;
3086: ISInvertPermutation(iscol,PETSC_DECIDE,&isicol);
3087: ISIdentity(isrow,&row_identity);
3088: ISIdentity(iscol,&col_identity);
3090: if (!levels && row_identity && col_identity) { /* special case copy the nonzero structure */
3091: MatDuplicate_SeqBAIJ(A,MAT_DO_NOT_COPY_VALUES,fact);
3092: (*fact)->factor = FACTOR_LU;
3093: b = (Mat_SeqBAIJ*)(*fact)->data;
3094: if (!b->diag) {
3095: MatMarkDiagonal_SeqBAIJ(*fact);
3096: }
3097: MatMissingDiagonal_SeqBAIJ(*fact);
3098: b->row = isrow;
3099: b->col = iscol;
3100: PetscObjectReference((PetscObject)isrow);
3101: PetscObjectReference((PetscObject)iscol);
3102: b->icol = isicol;
3103: b->pivotinblocks = (info->pivotinblocks) ? PETSC_TRUE : PETSC_FALSE;
3104: PetscMalloc(((*fact)->rmap.N+1+(*fact)->rmap.bs)*sizeof(PetscScalar),&b->solve_work);
3105: } else { /* general case perform the symbolic factorization */
3106: ISGetIndices(isrow,&r);
3107: ISGetIndices(isicol,&ic);
3109: /* get new row pointers */
3110: PetscMalloc((n+1)*sizeof(PetscInt),&ainew);
3111: ainew[0] = 0;
3112: /* don't know how many column pointers are needed so estimate */
3113: jmax = (PetscInt)(f*ai[n] + 1);
3114: PetscMalloc((jmax)*sizeof(PetscInt),&ajnew);
3115: /* ajfill is level of fill for each fill entry */
3116: PetscMalloc((jmax)*sizeof(PetscInt),&ajfill);
3117: /* fill is a linked list of nonzeros in active row */
3118: PetscMalloc((n+1)*sizeof(PetscInt),&fill);
3119: /* im is level for each filled value */
3120: PetscMalloc((n+1)*sizeof(PetscInt),&im);
3121: /* dloc is location of diagonal in factor */
3122: PetscMalloc((n+1)*sizeof(PetscInt),&dloc);
3123: dloc[0] = 0;
3124: for (prow=0; prow<n; prow++) {
3126: /* copy prow into linked list */
3127: nzf = nz = ai[r[prow]+1] - ai[r[prow]];
3128: if (!nz) SETERRQ(PETSC_ERR_MAT_LU_ZRPVT,"Empty row in matrix");
3129: xi = aj + ai[r[prow]];
3130: fill[n] = n;
3131: fill[prow] = -1; /* marker for diagonal entry */
3132: while (nz--) {
3133: fm = n;
3134: idx = ic[*xi++];
3135: do {
3136: m = fm;
3137: fm = fill[m];
3138: } while (fm < idx);
3139: fill[m] = idx;
3140: fill[idx] = fm;
3141: im[idx] = 0;
3142: }
3144: /* make sure diagonal entry is included */
3145: if (diagonal_fill && fill[prow] == -1) {
3146: fm = n;
3147: while (fill[fm] < prow) fm = fill[fm];
3148: fill[prow] = fill[fm]; /* insert diagonal into linked list */
3149: fill[fm] = prow;
3150: im[prow] = 0;
3151: nzf++;
3152: dcount++;
3153: }
3155: nzi = 0;
3156: row = fill[n];
3157: while (row < prow) {
3158: incrlev = im[row] + 1;
3159: nz = dloc[row];
3160: xi = ajnew + ainew[row] + nz + 1;
3161: flev = ajfill + ainew[row] + nz + 1;
3162: nnz = ainew[row+1] - ainew[row] - nz - 1;
3163: fm = row;
3164: while (nnz-- > 0) {
3165: idx = *xi++;
3166: if (*flev + incrlev > levels) {
3167: flev++;
3168: continue;
3169: }
3170: do {
3171: m = fm;
3172: fm = fill[m];
3173: } while (fm < idx);
3174: if (fm != idx) {
3175: im[idx] = *flev + incrlev;
3176: fill[m] = idx;
3177: fill[idx] = fm;
3178: fm = idx;
3179: nzf++;
3180: } else {
3181: if (im[idx] > *flev + incrlev) im[idx] = *flev+incrlev;
3182: }
3183: flev++;
3184: }
3185: row = fill[row];
3186: nzi++;
3187: }
3188: /* copy new filled row into permanent storage */
3189: ainew[prow+1] = ainew[prow] + nzf;
3190: if (ainew[prow+1] > jmax) {
3192: /* estimate how much additional space we will need */
3193: /* use the strategy suggested by David Hysom <hysom@perch-t.icase.edu> */
3194: /* just double the memory each time */
3195: PetscInt maxadd = jmax;
3196: /* maxadd = (int)(((f*ai[n]+1)*(n-prow+5))/n); */
3197: if (maxadd < nzf) maxadd = (n-prow)*(nzf+1);
3198: jmax += maxadd;
3200: /* allocate a longer ajnew and ajfill */
3201: PetscMalloc(jmax*sizeof(PetscInt),&xi);
3202: PetscMemcpy(xi,ajnew,ainew[prow]*sizeof(PetscInt));
3203: PetscFree(ajnew);
3204: ajnew = xi;
3205: PetscMalloc(jmax*sizeof(PetscInt),&xi);
3206: PetscMemcpy(xi,ajfill,ainew[prow]*sizeof(PetscInt));
3207: PetscFree(ajfill);
3208: ajfill = xi;
3209: reallocate++; /* count how many reallocations are needed */
3210: }
3211: xi = ajnew + ainew[prow];
3212: flev = ajfill + ainew[prow];
3213: dloc[prow] = nzi;
3214: fm = fill[n];
3215: while (nzf--) {
3216: *xi++ = fm;
3217: *flev++ = im[fm];
3218: fm = fill[fm];
3219: }
3220: /* make sure row has diagonal entry */
3221: if (ajnew[ainew[prow]+dloc[prow]] != prow) {
3222: SETERRQ1(PETSC_ERR_MAT_LU_ZRPVT,"Row %D has missing diagonal in factored matrix\n\
3223: try running with -pc_factor_nonzeros_along_diagonal or -pc_factor_diagonal_fill",prow);
3224: }
3225: }
3226: PetscFree(ajfill);
3227: ISRestoreIndices(isrow,&r);
3228: ISRestoreIndices(isicol,&ic);
3229: PetscFree(fill);
3230: PetscFree(im);
3232: #if defined(PETSC_USE_INFO)
3233: {
3234: PetscReal af = ((PetscReal)ainew[n])/((PetscReal)ai[n]);
3235: PetscInfo3(A,"Reallocs %D Fill ratio:given %G needed %G\n",reallocate,f,af);
3236: PetscInfo1(A,"Run with -pc_factor_fill %G or use \n",af);
3237: PetscInfo1(A,"PCFactorSetFill(pc,%G);\n",af);
3238: PetscInfo(A,"for best performance.\n");
3239: if (diagonal_fill) {
3240: PetscInfo1(A,"Detected and replaced %D missing diagonals\n",dcount);
3241: }
3242: }
3243: #endif
3245: /* put together the new matrix */
3246: MatCreate(A->comm,fact);
3247: MatSetSizes(*fact,bs*n,bs*n,bs*n,bs*n);
3248: MatSetType(*fact,A->type_name);
3249: MatSeqBAIJSetPreallocation_SeqBAIJ(*fact,bs,MAT_SKIP_ALLOCATION,PETSC_NULL);
3250: PetscLogObjectParent(*fact,isicol);
3251: b = (Mat_SeqBAIJ*)(*fact)->data;
3252: b->singlemalloc = PETSC_FALSE;
3253: PetscMalloc(bs2*ainew[n]*sizeof(MatScalar),&b->a);
3254: b->j = ajnew;
3255: b->i = ainew;
3256: for (i=0; i<n; i++) dloc[i] += ainew[i];
3257: b->diag = dloc;
3258: b->ilen = 0;
3259: b->imax = 0;
3260: b->row = isrow;
3261: b->col = iscol;
3262: b->pivotinblocks = (info->pivotinblocks) ? PETSC_TRUE : PETSC_FALSE;
3263: PetscObjectReference((PetscObject)isrow);
3264: PetscObjectReference((PetscObject)iscol);
3265: b->icol = isicol;
3266: PetscMalloc((bs*n+bs)*sizeof(PetscScalar),&b->solve_work);
3267: /* In b structure: Free imax, ilen, old a, old j.
3268: Allocate dloc, solve_work, new a, new j */
3269: PetscLogObjectMemory(*fact,(ainew[n]-n)*(sizeof(PetscInt))+bs2*ainew[n]*sizeof(PetscScalar));
3270: b->maxnz = b->nz = ainew[n];
3271: (*fact)->factor = FACTOR_LU;
3273: (*fact)->info.factor_mallocs = reallocate;
3274: (*fact)->info.fill_ratio_given = f;
3275: (*fact)->info.fill_ratio_needed = ((PetscReal)ainew[n])/((PetscReal)ai[prow]);
3276: }
3278: if (row_identity && col_identity) {
3279: MatSeqBAIJ_UpdateFactorNumeric_NaturalOrdering(*fact);
3280: }
3281: return(0);
3282: }
3286: PetscErrorCode MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE(Mat A)
3287: {
3288: /* Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; */
3289: /* int i,*AJ=a->j,nz=a->nz; */
3291: /* Undo Column scaling */
3292: /* while (nz--) { */
3293: /* AJ[i] = AJ[i]/4; */
3294: /* } */
3295: /* This should really invoke a push/pop logic, but we don't have that yet. */
3296: A->ops->setunfactored = PETSC_NULL;
3297: return(0);
3298: }
3302: PetscErrorCode MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE_usj(Mat A)
3303: {
3304: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
3305: PetscInt *AJ=a->j,nz=a->nz;
3306: unsigned short *aj=(unsigned short *)AJ;
3308: /* Is this really necessary? */
3309: while (nz--) {
3310: AJ[nz] = (int)((unsigned int)aj[nz]); /* First extend, then convert to signed. */
3311: }
3312: A->ops->setunfactored = PETSC_NULL;
3313: return(0);
3314: }
3318: PetscErrorCode MatSeqBAIJ_UpdateFactorNumeric_NaturalOrdering(Mat inA)
3319: {
3321: /*
3322: Blocksize 2, 3, 4, 5, 6 and 7 have a special faster factorization/solver
3323: with natural ordering
3324: */
3326: inA->ops->solve = MatSolve_SeqBAIJ_Update;
3327: inA->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_Update;
3328: switch (inA->rmap.bs) {
3329: case 1:
3330: inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_1;
3331: PetscInfo(inA,"Using special in-place natural ordering factor BS=1\n");
3332: break;
3333: case 2:
3334: inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_2_NaturalOrdering;
3335: PetscInfo(inA,"Using special in-place natural ordering factor BS=2\n");
3336: break;
3337: case 3:
3338: inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_3_NaturalOrdering;
3339: PetscInfo(inA,"Using special in-place natural ordering factor BS=3\n");
3340: break;
3341: case 4:
3342: #if defined(PETSC_USE_MAT_SINGLE)
3343: {
3344: PetscTruth sse_enabled_local;
3346: PetscSSEIsEnabled(inA->comm,&sse_enabled_local,PETSC_NULL);
3347: if (sse_enabled_local) {
3348: # if defined(PETSC_HAVE_SSE)
3349: int i,*AJ=a->j,nz=a->nz,n=a->mbs;
3350: if (n==(unsigned short)n) {
3351: unsigned short *aj=(unsigned short *)AJ;
3352: for (i=0;i<nz;i++) {
3353: aj[i] = (unsigned short)AJ[i];
3354: }
3355: inA->ops->setunfactored = MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE_usj;
3356: inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE_usj;
3357: PetscInfo(inA,"Using special SSE, in-place natural ordering, ushort j index factor BS=4\n");
3358: } else {
3359: /* Scale the column indices for easier indexing in MatSolve. */
3360: /* for (i=0;i<nz;i++) { */
3361: /* AJ[i] = AJ[i]*4; */
3362: /* } */
3363: inA->ops->setunfactored = MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE;
3364: inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE;
3365: PetscInfo(inA,"Using special SSE, in-place natural ordering, int j index factor BS=4\n");
3366: }
3367: # else
3368: /* This should never be reached. If so, problem in PetscSSEIsEnabled. */
3369: SETERRQ(PETSC_ERR_SUP,"SSE Hardware unavailable");
3370: # endif
3371: } else {
3372: inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering;
3373: PetscInfo(inA,"Using special in-place natural ordering factor BS=4\n");
3374: }
3375: }
3376: #else
3377: inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering;
3378: PetscInfo(inA,"Using special in-place natural ordering factor BS=4\n");
3379: #endif
3380: break;
3381: case 5:
3382: inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_5_NaturalOrdering;
3383: PetscInfo(inA,"Using special in-place natural ordering factor BS=5\n");
3384: break;
3385: case 6:
3386: inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_6_NaturalOrdering;
3387: PetscInfo(inA,"Using special in-place natural ordering factor BS=6\n");
3388: break;
3389: case 7:
3390: inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_7_NaturalOrdering;
3391: PetscInfo(inA,"Using special in-place natural ordering factor BS=7\n");
3392: break;
3393: }
3394: return(0);
3395: }
3399: PetscErrorCode MatSeqBAIJ_UpdateSolvers(Mat A)
3400: {
3401: /*
3402: Blocksize 2, 3, 4, 5, 6 and 7 have a special faster factorization/solver
3403: with natural ordering
3404: */
3405: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
3406: IS row = a->row, col = a->col;
3407: PetscTruth row_identity, col_identity;
3408: PetscTruth use_natural;
3413: use_natural = PETSC_FALSE;
3414: if (row && col) {
3415: ISIdentity(row,&row_identity);
3416: ISIdentity(col,&col_identity);
3418: if (row_identity && col_identity) {
3419: use_natural = PETSC_TRUE;
3420: }
3421: } else {
3422: use_natural = PETSC_TRUE;
3423: }
3425: switch (A->rmap.bs) {
3426: case 1:
3427: if (use_natural) {
3428: A->ops->solve = MatSolve_SeqBAIJ_1_NaturalOrdering;
3429: A->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_1_NaturalOrdering;
3430: PetscInfo(A,"Using special in-place natural ordering solve BS=1\n");
3431: PetscInfo(A,"Using special in-place natural ordering solve BS=4\n");
3432: } else {
3433: A->ops->solve = MatSolve_SeqBAIJ_1;
3434: A->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_1;
3435: }
3436: break;
3437: case 2:
3438: if (use_natural) {
3439: A->ops->solve = MatSolve_SeqBAIJ_2_NaturalOrdering;
3440: A->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_2_NaturalOrdering;
3441: PetscInfo(A,"Using special in-place natural ordering solve BS=2\n");
3442: PetscInfo(A,"Using special in-place natural ordering solve BS=4\n");
3443: } else {
3444: A->ops->solve = MatSolve_SeqBAIJ_2;
3445: A->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_2;
3446: }
3447: break;
3448: case 3:
3449: if (use_natural) {
3450: A->ops->solve = MatSolve_SeqBAIJ_3_NaturalOrdering;
3451: A->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_3_NaturalOrdering;
3452: PetscInfo(A,"Using special in-place natural ordering solve BS=3\n");
3453: PetscInfo(A,"Using special in-place natural ordering solve BS=4\n");
3454: } else {
3455: A->ops->solve = MatSolve_SeqBAIJ_3;
3456: A->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_3;
3457: }
3458: break;
3459: case 4:
3460: {
3461: PetscTruth sse_enabled_local;
3462: PetscSSEIsEnabled(A->comm,&sse_enabled_local,PETSC_NULL);
3463: if (use_natural) {
3464: #if defined(PETSC_USE_MAT_SINGLE)
3465: if (sse_enabled_local) { /* Natural + Single + SSE */
3466: # if defined(PETSC_HAVE_SSE)
3467: PetscInt n=a->mbs;
3468: if (n==(unsigned short)n) {
3469: A->ops->solve = MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion_usj;
3470: PetscInfo(A,"Using single precision, SSE, in-place, ushort j index, natural ordering solve BS=4\n");
3471: } else {
3472: A->ops->solve = MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion;
3473: PetscInfo(A,"Using single precision, SSE, in-place, int j index, natural ordering solve BS=4\n");
3474: }
3475: # else
3476: /* This should never be reached, unless there is a bug in PetscSSEIsEnabled(). */
3477: SETERRQ(PETSC_ERR_SUP,"SSE implementations are unavailable.");
3478: # endif
3479: } else { /* Natural + Single */
3480: A->ops->solve = MatSolve_SeqBAIJ_4_NaturalOrdering_Demotion;
3481: PetscInfo(A,"Using single precision, in-place, natural ordering solve BS=4\n");
3482: }
3483: #else
3484: A->ops->solve = MatSolve_SeqBAIJ_4_NaturalOrdering;
3485: PetscInfo(A,"Using special in-place, natural ordering solve BS=4\n");
3486: #endif
3487: A->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_NaturalOrdering;
3488: PetscInfo(A,"Using special in-place, natural ordering solve BS=4\n");
3489: } else { /* Arbitrary ordering */
3490: #if defined(PETSC_USE_MAT_SINGLE)
3491: if (sse_enabled_local) { /* Arbitrary + Single + SSE */
3492: # if defined(PETSC_HAVE_SSE)
3493: A->ops->solve = MatSolve_SeqBAIJ_4_SSE_Demotion;
3494: PetscInfo(A,"Using single precision, SSE solve BS=4\n");
3495: # else
3496: /* This should never be reached, unless there is a bug in PetscSSEIsEnabled(). */
3497: SETERRQ(PETSC_ERR_SUP,"SSE implementations are unavailable.");
3498: # endif
3499: } else { /* Arbitrary + Single */
3500: A->ops->solve = MatSolve_SeqBAIJ_4_Demotion;
3501: PetscInfo(A,"Using single precision solve BS=4\n");
3502: }
3503: #else
3504: A->ops->solve = MatSolve_SeqBAIJ_4;
3505: #endif
3506: A->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4;
3507: }
3508: }
3509: break;
3510: case 5:
3511: if (use_natural) {
3512: A->ops->solve = MatSolve_SeqBAIJ_5_NaturalOrdering;
3513: A->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_5_NaturalOrdering;
3514: PetscInfo(A,"Using special in-place natural ordering solve BS=5\n");
3515: PetscInfo(A,"Using special in-place natural ordering solve BS=5\n");
3516: } else {
3517: A->ops->solve = MatSolve_SeqBAIJ_5;
3518: A->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_5;
3519: }
3520: break;
3521: case 6:
3522: if (use_natural) {
3523: A->ops->solve = MatSolve_SeqBAIJ_6_NaturalOrdering;
3524: A->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_6_NaturalOrdering;
3525: PetscInfo(A,"Using special in-place natural ordering solve BS=6\n");
3526: PetscInfo(A,"Using special in-place natural ordering solve BS=6\n");
3527: } else {
3528: A->ops->solve = MatSolve_SeqBAIJ_6;
3529: A->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_6;
3530: }
3531: break;
3532: case 7:
3533: if (use_natural) {
3534: A->ops->solve = MatSolve_SeqBAIJ_7_NaturalOrdering;
3535: A->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_7_NaturalOrdering;
3536: PetscInfo(A,"Using special in-place natural ordering solve BS=7\n");
3537: PetscInfo(A,"Using special in-place natural ordering solve BS=7\n");
3538: } else {
3539: A->ops->solve = MatSolve_SeqBAIJ_7;
3540: A->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_7;
3541: }
3542: break;
3543: default:
3544: A->ops->solve = MatSolve_SeqBAIJ_N;
3545: break;
3546: }
3547: return(0);
3548: }
3552: PetscErrorCode MatSolve_SeqBAIJ_Update(Mat A,Vec x,Vec y)
3553: {
3557: MatSeqBAIJ_UpdateSolvers(A);
3558: if (A->ops->solve != MatSolve_SeqBAIJ_Update) {
3559: (*A->ops->solve)(A,x,y);
3560: } else {
3561: SETERRQ(PETSC_ERR_SUP,"Something really wrong happened.");
3562: }
3563: return(0);
3564: }
3568: PetscErrorCode MatSolveTranspose_SeqBAIJ_Update(Mat A,Vec x,Vec y)
3569: {
3573: MatSeqBAIJ_UpdateSolvers(A);
3574: (*A->ops->solvetranspose)(A,x,y);
3575: return(0);
3576: }