-
Notifications
You must be signed in to change notification settings - Fork 60
/
pstrord.f
3460 lines (3460 loc) · 158 KB
/
pstrord.f
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
SUBROUTINE PSTRORD( COMPQ, SELECT, PARA, N, T, IT, JT,
$ DESCT, Q, IQ, JQ, DESCQ, WR, WI, M, WORK, LWORK,
$ IWORK, LIWORK, INFO )
*
* Contribution from the Department of Computing Science and HPC2N,
* Umea University, Sweden
*
* -- ScaLAPACK computational routine (version 2.0.2) --
* Univ. of Tennessee, Univ. of California Berkeley, Univ. of Colorado Denver
* May 1 2012
*
IMPLICIT NONE
*
* .. Scalar Arguments ..
CHARACTER COMPQ
INTEGER INFO, LIWORK, LWORK, M, N,
$ IT, JT, IQ, JQ
* ..
* .. Array Arguments ..
INTEGER SELECT( * )
INTEGER PARA( 6 ), DESCT( * ), DESCQ( * ), IWORK( * )
REAL Q( * ), T( * ), WI( * ), WORK( * ), WR( * )
* ..
*
* Purpose
* =======
*
* PSTRORD reorders the real Schur factorization of a real matrix
* A = Q*T*Q**T, so that a selected cluster of eigenvalues appears
* in the leading diagonal blocks of the upper quasi-triangular matrix
* T, and the leading columns of Q form an orthonormal basis of the
* corresponding right invariant subspace.
*
* T must be in Schur form (as returned by PSLAHQR), that is, block
* upper triangular with 1-by-1 and 2-by-2 diagonal blocks.
*
* This subroutine uses a delay and accumulate procedure for performing
* the off-diagonal updates (see references for details).
*
* Notes
* =====
*
* Each global data object is described by an associated description
* vector. This vector stores the information required to establish
* the mapping between an object element and its corresponding process
* and memory location.
*
* Let A be a generic term for any 2D block cyclicly distributed array.
* Such a global array has an associated description vector DESCA.
* In the following comments, the character _ should be read as
* "of the global array".
*
* NOTATION STORED IN EXPLANATION
* --------------- -------------- --------------------------------------
* DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case,
* DTYPE_A = 1.
* CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating
* the BLACS process grid A is distribu-
* ted over. The context itself is glo-
* bal, but the handle (the integer
* value) may vary.
* M_A (global) DESCA( M_ ) The number of rows in the global
* array A.
* N_A (global) DESCA( N_ ) The number of columns in the global
* array A.
* MB_A (global) DESCA( MB_ ) The blocking factor used to distribute
* the rows of the array.
* NB_A (global) DESCA( NB_ ) The blocking factor used to distribute
* the columns of the array.
* RSRC_A (global) DESCA( RSRC_ ) The process row over which the first
* row of the array A is distributed.
* CSRC_A (global) DESCA( CSRC_ ) The process column over which the
* first column of the array A is
* distributed.
* LLD_A (local) DESCA( LLD_ ) The leading dimension of the local
* array. LLD_A >= MAX(1,LOCr(M_A)).
*
* Let K be the number of rows or columns of a distributed matrix,
* and assume that its process grid has dimension p x q.
* LOCr( K ) denotes the number of elements of K that a process
* would receive if K were distributed over the p processes of its
* process column.
* Similarly, LOCc( K ) denotes the number of elements of K that a
* process would receive if K were distributed over the q processes of
* its process row.
* The values of LOCr() and LOCc() may be determined via a call to the
* ScaLAPACK tool function, NUMROC:
* LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ),
* LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ).
* An upper bound for these quantities may be computed by:
* LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A
* LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A
*
* Arguments
* =========
*
*
* COMPQ (global input) CHARACTER*1
* = 'V': update the matrix Q of Schur vectors;
* = 'N': do not update Q.
*
* SELECT (global input/output) INTEGER array, dimension (N)
* SELECT specifies the eigenvalues in the selected cluster. To
* select a real eigenvalue w(j), SELECT(j) must be set to 1.
* To select a complex conjugate pair of eigenvalues
* w(j) and w(j+1), corresponding to a 2-by-2 diagonal block,
* either SELECT(j) or SELECT(j+1) or both must be set to 1;
* a complex conjugate pair of eigenvalues must be
* either both included in the cluster or both excluded.
* On output, the (partial) reordering is displayed.
*
* PARA (global input) INTEGER*6
* Block parameters (some should be replaced by calls to
* PILAENV and others by meaningful default values):
* PARA(1) = maximum number of concurrent computational windows
* allowed in the algorithm;
* 0 < PARA(1) <= min(NPROW,NPCOL) must hold;
* PARA(2) = number of eigenvalues in each window;
* 0 < PARA(2) < PARA(3) must hold;
* PARA(3) = window size; PARA(2) < PARA(3) < DESCT(MB_)
* must hold;
* PARA(4) = minimal percentage of flops required for
* performing matrix-matrix multiplications instead
* of pipelined orthogonal transformations;
* 0 <= PARA(4) <= 100 must hold;
* PARA(5) = width of block column slabs for row-wise
* application of pipelined orthogonal
* transformations in their factorized form;
* 0 < PARA(5) <= DESCT(MB_) must hold.
* PARA(6) = the maximum number of eigenvalues moved together
* over a process border; in practice, this will be
* approximately half of the cross border window size
* 0 < PARA(6) <= PARA(2) must hold;
*
* N (global input) INTEGER
* The order of the globally distributed matrix T. N >= 0.
*
* T (local input/output) REAL array,
* dimension (LLD_T,LOCc(N)).
* On entry, the local pieces of the global distributed
* upper quasi-triangular matrix T, in Schur form. On exit, T is
* overwritten by the local pieces of the reordered matrix T,
* again in Schur form, with the selected eigenvalues in the
* globally leading diagonal blocks.
*
* IT (global input) INTEGER
* JT (global input) INTEGER
* The row and column index in the global array T indicating the
* first column of sub( T ). IT = JT = 1 must hold.
*
* DESCT (global and local input) INTEGER array of dimension DLEN_.
* The array descriptor for the global distributed matrix T.
*
* Q (local input/output) REAL array,
* dimension (LLD_Q,LOCc(N)).
* On entry, if COMPQ = 'V', the local pieces of the global
* distributed matrix Q of Schur vectors.
* On exit, if COMPQ = 'V', Q has been postmultiplied by the
* global orthogonal transformation matrix which reorders T; the
* leading M columns of Q form an orthonormal basis for the
* specified invariant subspace.
* If COMPQ = 'N', Q is not referenced.
*
* IQ (global input) INTEGER
* JQ (global input) INTEGER
* The column index in the global array Q indicating the
* first column of sub( Q ). IQ = JQ = 1 must hold.
*
* DESCQ (global and local input) INTEGER array of dimension DLEN_.
* The array descriptor for the global distributed matrix Q.
*
* WR (global output) REAL array, dimension (N)
* WI (global output) REAL array, dimension (N)
* The real and imaginary parts, respectively, of the reordered
* eigenvalues of T. The eigenvalues are in principle stored in
* the same order as on the diagonal of T, with WR(i) = T(i,i)
* and, if T(i:i+1,i:i+1) is a 2-by-2 diagonal block, WI(i) > 0
* and WI(i+1) = -WI(i).
* Note also that if a complex eigenvalue is sufficiently
* ill-conditioned, then its value may differ significantly
* from its value before reordering.
*
* M (global output) INTEGER
* The dimension of the specified invariant subspace.
* 0 <= M <= N.
*
* WORK (local workspace/output) REAL array,
* dimension (LWORK)
* On exit, if INFO = 0, WORK(1) returns the optimal LWORK.
*
* LWORK (local input) INTEGER
* The dimension of the array WORK.
*
* If LWORK = -1, then a workspace query is assumed; the routine
* only calculates the optimal size of the WORK array, returns
* this value as the first entry of the WORK array, and no error
* message related to LWORK is issued by PXERBLA.
*
* IWORK (local workspace/output) INTEGER array, dimension (LIWORK)
*
* LIWORK (local input) INTEGER
* The dimension of the array IWORK.
*
* If LIWORK = -1, then a workspace query is assumed; the
* routine only calculates the optimal size of the IWORK array,
* returns this value as the first entry of the IWORK array, and
* no error message related to LIWORK is issued by PXERBLA.
*
* INFO (global output) INTEGER
* = 0: successful exit
* < 0: if INFO = -i, the i-th argument had an illegal value.
* If the i-th argument is an array and the j-entry had
* an illegal value, then INFO = -(i*1000+j), if the i-th
* argument is a scalar and had an illegal value, then INFO = -i.
* > 0: here we have several possibilites
* *) Reordering of T failed because some eigenvalues are too
* close to separate (the problem is very ill-conditioned);
* T may have been partially reordered, and WR and WI
* contain the eigenvalues in the same order as in T.
* On exit, INFO = {the index of T where the swap failed}.
* *) A 2-by-2 block to be reordered split into two 1-by-1
* blocks and the second block failed to swap with an
* adjacent block.
* On exit, INFO = {the index of T where the swap failed}.
* *) If INFO = N+1, there is no valid BLACS context (see the
* BLACS documentation for details).
* In a future release this subroutine may distinguish between
* the case 1 and 2 above.
*
* Additional requirements
* =======================
*
* The following alignment requirements must hold:
* (a) DESCT( MB_ ) = DESCT( NB_ ) = DESCQ( MB_ ) = DESCQ( NB_ )
* (b) DESCT( RSRC_ ) = DESCQ( RSRC_ )
* (c) DESCT( CSRC_ ) = DESCQ( CSRC_ )
*
* All matrices must be blocked by a block factor larger than or
* equal to two (3). This is to simplify reordering across processor
* borders in the presence of 2-by-2 blocks.
*
* Limitations
* ===========
*
* This algorithm cannot work on submatrices of T and Q, i.e.,
* IT = JT = IQ = JQ = 1 must hold. This is however no limitation
* since PDLAHQR does not compute Schur forms of submatrices anyway.
*
* References
* ==========
*
* [1] Z. Bai and J. W. Demmel; On swapping diagonal blocks in real
* Schur form, Linear Algebra Appl., 186:73--95, 1993. Also as
* LAPACK Working Note 54.
*
* [2] D. Kressner; Block algorithms for reordering standard and
* generalized Schur forms, ACM TOMS, 32(4):521-532, 2006.
* Also LAPACK Working Note 171.
*
* [3] R. Granat, B. Kagstrom, and D. Kressner; Parallel eigenvalue
* reordering in real Schur form, Concurrency and Computations:
* Practice and Experience, 21(9):1225-1250, 2009. Also as
* LAPACK Working Note 192.
*
* Parallel execution recommendations
* ==================================
*
* Use a square grid, if possible, for maximum performance. The block
* parameters in PARA should be kept well below the data distribution
* block size. In particular, see [3] for recommended settings for
* these parameters.
*
* In general, the parallel algorithm strives to perform as much work
* as possible without crossing the block borders on the main block
* diagonal.
*
* Contributors
* ============
*
* Implemented by Robert Granat, Dept. of Computing Science and HPC2N,
* Umea University, Sweden, March 2007,
* in collaboration with Bo Kagstrom and Daniel Kressner.
* Modified by Meiyue Shao, October 2011.
*
* Revisions
* =========
*
* Please send bug-reports to [email protected]
*
* Keywords
* ========
*
* Real Schur form, eigenvalue reordering
*
* =====================================================================
* ..
* .. Parameters ..
CHARACTER TOP
INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_,
$ LLD_, MB_, M_, NB_, N_, RSRC_
REAL ZERO, ONE
PARAMETER ( TOP = '1-Tree',
$ BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1,
$ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6,
$ RSRC_ = 7, CSRC_ = 8, LLD_ = 9,
$ ZERO = 0.0, ONE = 1.0 )
* ..
* .. Local Scalars ..
LOGICAL LQUERY, PAIR, SWAP, WANTQ,
$ ISHH, FIRST, SKIP1CR, BORDER, LASTWAIT
INTEGER NPROW, NPCOL, MYROW, MYCOL, NB, NPROCS,
$ IERR, DIM1, INDX, LLDT, TRSRC, TCSRC, ILOC1,
$ JLOC1, MYIERR, ICTXT,
$ RSRC1, CSRC1, ILOC3, JLOC3, TRSRC3,
$ TCSRC3, ILOC, JLOC, TRSRC4, TCSRC4,
$ FLOPS, I, ILO, IHI, J, K, KK, KKS,
$ KS, LIWMIN, LWMIN, MMULT, N1, N2,
$ NCB, NDTRAF, NITRAF, NWIN, NUMWIN, PDTRAF,
$ PITRAF, PDW, WINEIG, WINSIZ, LLDQ,
$ RSRC, CSRC, ILILO, ILIHI, ILSEL, IRSRC,
$ ICSRC, IPIW, IPW1, IPW2, IPW3, TIHI, TILO,
$ LIHI, WINDOW, LILO, LSEL, BUFFER,
$ NMWIN2, BUFFLEN, LROWS, LCOLS, ILOC2, JLOC2,
$ WNEICR, WINDOW0, RSRC4, CSRC4, LIHI4, RSRC3,
$ CSRC3, RSRC2, CSRC2, LIHIC, LIHI1, ILEN4,
$ SELI4, ILEN1, DIM4, IPW4, QROWS, TROWS,
$ TCOLS, IPW5, IPW6, IPW7, IPW8, JLOC4,
$ EAST, WEST, ILOC4, SOUTH, NORTH, INDXS,
$ ITT, JTT, ILEN, DLEN, INDXE, TRSRC1, TCSRC1,
$ TRSRC2, TCSRC2, ILOS, DIR, TLIHI, TLILO, TLSEL,
$ ROUND, LAST, WIN0S, WIN0E, WINE
REAL ELEM, ELEM1, ELEM2, ELEM3, ELEM4, SN, CS, TMP,
$ ELEM5
* ..
* .. Local Arrays ..
INTEGER IBUFF( 8 ), IDUM1( 1 ), IDUM2( 1 ), MMAX( 1 ),
$ MMIN( 1 )
* ..
* .. External Functions ..
LOGICAL LSAME
INTEGER NUMROC, INDXG2P, INDXG2L
EXTERNAL LSAME, NUMROC, INDXG2P, INDXG2L
* ..
* .. External Subroutines ..
EXTERNAL PSLACPY, PXERBLA, PCHK1MAT, PCHK2MAT,
$ SGEMM, SLAMOV, ILACPY, CHK1MAT,
$ INFOG2L, DGSUM2D, SGESD2D, SGERV2D, SGEBS2D,
$ SGEBR2D, IGSUM2D, BLACS_GRIDINFO, IGEBS2D,
$ IGEBR2D, IGAMX2D, IGAMN2D, BSLAAPP, BDTREXC
* ..
* .. Intrinsic Functions ..
INTRINSIC ABS, MAX, SQRT, MIN
* ..
* .. Local Functions ..
INTEGER ICEIL
* ..
* .. Executable Statements ..
*
* Get grid parameters.
*
ICTXT = DESCT( CTXT_ )
CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL )
NPROCS = NPROW*NPCOL
*
* Test if grid is O.K., i.e., the context is valid.
*
INFO = 0
IF( NPROW.EQ.-1 ) THEN
INFO = N+1
END IF
*
* Check if workspace query.
*
LQUERY = LWORK.EQ.-1 .OR. LIWORK.EQ.-1
*
* Test dimensions for local sanity.
*
IF( INFO.EQ.0 ) THEN
CALL CHK1MAT( N, 5, N, 5, IT, JT, DESCT, 9, INFO )
END IF
IF( INFO.EQ.0 ) THEN
CALL CHK1MAT( N, 5, N, 5, IQ, JQ, DESCQ, 13, INFO )
END IF
*
* Check the blocking sizes for alignment requirements.
*
IF( INFO.EQ.0 ) THEN
IF( DESCT( MB_ ).NE.DESCT( NB_ ) ) INFO = -(1000*9 + MB_)
END IF
IF( INFO.EQ.0 ) THEN
IF( DESCQ( MB_ ).NE.DESCQ( NB_ ) ) INFO = -(1000*13 + MB_)
END IF
IF( INFO.EQ.0 ) THEN
IF( DESCT( MB_ ).NE.DESCQ( MB_ ) ) INFO = -(1000*9 + MB_)
END IF
*
* Check the blocking sizes for minimum sizes.
*
IF( INFO.EQ.0 ) THEN
IF( N.NE.DESCT( MB_ ) .AND. DESCT( MB_ ).LT.3 )
$ INFO = -(1000*9 + MB_)
IF( N.NE.DESCQ( MB_ ) .AND. DESCQ( MB_ ).LT.3 )
$ INFO = -(1000*13 + MB_)
END IF
*
* Check parameters in PARA.
*
NB = DESCT( MB_ )
IF( INFO.EQ.0 ) THEN
IF( PARA(1).LT.1 .OR. PARA(1).GT.MIN(NPROW,NPCOL) )
$ INFO = -(1000 * 4 + 1)
IF( PARA(2).LT.1 .OR. PARA(2).GE.PARA(3) )
$ INFO = -(1000 * 4 + 2)
IF( PARA(3).LT.1 .OR. PARA(3).GT.NB )
$ INFO = -(1000 * 4 + 3)
IF( PARA(4).LT.0 .OR. PARA(4).GT.100 )
$ INFO = -(1000 * 4 + 4)
IF( PARA(5).LT.1 .OR. PARA(5).GT.NB )
$ INFO = -(1000 * 4 + 5)
IF( PARA(6).LT.1 .OR. PARA(6).GT.PARA(2) )
$ INFO = -(1000 * 4 + 6)
END IF
*
* Check requirements on IT, JT, IQ and JQ.
*
IF( INFO.EQ.0 ) THEN
IF( IT.NE.1 ) INFO = -6
IF( JT.NE.IT ) INFO = -7
IF( IQ.NE.1 ) INFO = -10
IF( JQ.NE.IQ ) INFO = -11
END IF
*
* Test input parameters for global sanity.
*
IF( INFO.EQ.0 ) THEN
CALL PCHK1MAT( N, 5, N, 5, IT, JT, DESCT, 9, 0, IDUM1,
$ IDUM2, INFO )
END IF
IF( INFO.EQ.0 ) THEN
CALL PCHK1MAT( N, 5, N, 5, IQ, JQ, DESCQ, 13, 0, IDUM1,
$ IDUM2, INFO )
END IF
IF( INFO.EQ.0 ) THEN
CALL PCHK2MAT( N, 5, N, 5, IT, JT, DESCT, 9, N, 5, N, 5,
$ IQ, JQ, DESCQ, 13, 0, IDUM1, IDUM2, INFO )
END IF
*
* Decode and test the input parameters.
*
IF( INFO.EQ.0 .OR. LQUERY ) THEN
*
WANTQ = LSAME( COMPQ, 'V' )
IF( N.LT.0 ) THEN
INFO = -4
ELSE
*
* Extract local leading dimension.
*
LLDT = DESCT( LLD_ )
LLDQ = DESCQ( LLD_ )
*
* Check the SELECT vector for consistency and set M to the
* dimension of the specified invariant subspace.
*
M = 0
DO 10 K = 1, N
IF( K.LT.N ) THEN
CALL INFOG2L( K+1, K, DESCT, NPROW, NPCOL,
$ MYROW, MYCOL, ITT, JTT, TRSRC, TCSRC )
IF( MYROW.EQ.TRSRC .AND. MYCOL.EQ.TCSRC ) THEN
ELEM = T( (JTT-1)*LLDT + ITT )
IF( ELEM.NE.ZERO ) THEN
IF( SELECT(K).NE.0 .AND.
$ SELECT(K+1).EQ.0 ) THEN
* INFO = -2
SELECT(K+1) = 1
ELSEIF( SELECT(K).EQ.0 .AND.
$ SELECT(K+1).NE.0 ) THEN
* INFO = -2
SELECT(K) = 1
END IF
END IF
END IF
END IF
IF( SELECT(K).NE.0 ) M = M + 1
10 CONTINUE
MMAX( 1 ) = M
MMIN( 1 ) = M
IF( NPROCS.GT.1 )
$ CALL IGAMX2D( ICTXT, 'All', TOP, 1, 1, MMAX( 1 ), 1, -1,
$ -1, -1, -1, -1 )
IF( NPROCS.GT.1 )
$ CALL IGAMN2D( ICTXT, 'All', TOP, 1, 1, MMIN( 1 ), 1, -1,
$ -1, -1, -1, -1 )
IF( MMAX( 1 ).GT.MMIN( 1 ) ) THEN
M = MMAX( 1 )
IF( NPROCS.GT.1 )
$ CALL IGAMX2D( ICTXT, 'All', TOP, N, 1, SELECT, N,
$ -1, -1, -1, -1, -1 )
END IF
*
* Compute needed workspace.
*
N1 = M
N2 = N - M
*
TROWS = NUMROC( N, NB, MYROW, DESCT(RSRC_), NPROW )
TCOLS = NUMROC( N, NB, MYCOL, DESCT(CSRC_), NPCOL )
LWMIN = N + 7*NB**2 + 2*TROWS*PARA( 3 ) + TCOLS*PARA( 3 ) +
$ MAX( TROWS*PARA( 3 ), TCOLS*PARA( 3 ) )
LIWMIN = 5*PARA( 1 ) + PARA( 2 )*PARA( 3 ) -
$ PARA( 2 ) * ( PARA( 2 ) + 1 ) / 2
*
IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) THEN
INFO = -17
ELSE IF( LIWORK.LT.LIWMIN .AND. .NOT.LQUERY ) THEN
INFO = -19
END IF
END IF
END IF
*
* Global maximum on info.
*
IF( NPROCS.GT.1 ) THEN
CALL IGAMX2D( ICTXT, 'All', TOP, 1, 1, INFO, 1, -1, -1,
$ -1, -1, -1 )
END IF
*
* Return if some argument is incorrect.
*
IF( INFO.NE.0 .AND. .NOT.LQUERY ) THEN
M = 0
CALL PXERBLA( ICTXT, 'PSTRORD', -INFO )
RETURN
ELSEIF( LQUERY ) THEN
WORK( 1 ) = FLOAT(LWMIN)
IWORK( 1 ) = LIWMIN
RETURN
END IF
*
* Quick return if possible.
*
IF( M.EQ.N .OR. M.EQ.0 ) GO TO 545
*
* Set parameters.
*
NUMWIN = PARA( 1 )
WINEIG = MAX( PARA( 2 ), 2 )
WINSIZ = MIN( MAX( PARA( 3 ), PARA( 2 )*2 ), NB )
MMULT = PARA( 4 )
NCB = PARA( 5 )
WNEICR = PARA( 6 )
*
* Insert some pointers into INTEGER workspace.
*
* Information about all the active windows is stored
* in IWORK( 1:5*NUMWIN ). Each processor has a copy.
* LILO: start position
* LIHI: stop position
* LSEL: number of selected eigenvalues
* RSRC: processor id (row)
* CSRC: processor id (col)
* IWORK( IPIW+ ) contain information of orthogonal transformations.
*
ILILO = 1
ILIHI = ILILO + NUMWIN
ILSEL = ILIHI + NUMWIN
IRSRC = ILSEL + NUMWIN
ICSRC = IRSRC + NUMWIN
IPIW = ICSRC + NUMWIN
*
* Insert some pointers into REAL workspace - for now we
* only need two pointers.
*
IPW1 = 1
IPW2 = IPW1 + NB
*
* Collect the selected blocks at the top-left corner of T.
*
* Globally: ignore eigenvalues that are already in order.
* ILO is a global variable and is kept updated to be consistent
* throughout the process mesh.
*
ILO = 0
40 CONTINUE
ILO = ILO + 1
IF( ILO.LE.N ) THEN
IF( SELECT(ILO).NE.0 ) GO TO 40
END IF
*
* Globally: start the collection at the top of the matrix. Here,
* IHI is a global variable and is kept updated to be consistent
* throughout the process mesh.
*
IHI = N
*
* Globally: While ( ILO <= M ) do
50 CONTINUE
*
IF( ILO.LE.M ) THEN
*
* Depending on the value of ILO, find the diagonal block index J,
* such that T(1+(J-1)*NB:1+J*NB,1+(J-1)*NB:1+J*NB) contains the
* first unsorted eigenvalue. Check that J does not point to a
* block with only one selected eigenvalue in the last position
* which belongs to a splitted 2-by-2 block.
*
ILOS = ILO - 1
52 CONTINUE
ILOS = ILOS + 1
IF( SELECT(ILOS).EQ.0 ) GO TO 52
IF( ILOS.LT.N ) THEN
IF( SELECT(ILOS+1).NE.0 .AND. MOD(ILOS,NB).EQ.0 ) THEN
CALL PSELGET( 'All', TOP, ELEM, T, ILOS+1, ILOS, DESCT )
IF( ELEM.NE.ZERO ) GO TO 52
END IF
END IF
J = ICEIL(ILOS,NB)
*
* Globally: Set start values of LILO and LIHI for all processes.
* Choose also the number of selected eigenvalues at top of each
* diagonal block such that the number of eigenvalues which remain
* to be reordered is an integer multiple of WINEIG.
*
* All the information is saved into the INTEGER workspace such
* that all processors are aware of each others operations.
*
* Compute the number of concurrent windows.
*
NMWIN2 = (ICEIL(IHI,NB)*NB - (ILO-MOD(ILO,NB)+1)+1) / NB
NMWIN2 = MIN( MIN( NUMWIN, NMWIN2 ), ICEIL(N,NB) - J + 1 )
*
* For all windows, set LSEL = 0 and find a proper start value of
* LILO such that LILO points at the first non-selected entry in
* the corresponding diagonal block of T.
*
DO 80 K = 1, NMWIN2
IWORK( ILSEL+K-1) = 0
IWORK( ILILO+K-1) = MAX( ILO, (J-1)*NB+(K-1)*NB+1 )
LILO = IWORK( ILILO+K-1 )
82 CONTINUE
IF( SELECT(LILO).NE.0 .AND. LILO.LT.(J+K-1)*NB ) THEN
LILO = LILO + 1
IF( LILO.LE.N ) GO TO 82
END IF
IWORK( ILILO+K-1 ) = LILO
*
* Fix each LILO to ensure that no 2-by-2 block is cut in top
* of the submatrix (LILO:LIHI,LILO:LIHI).
*
LILO = IWORK(ILILO+K-1)
IF( LILO.GT.NB ) THEN
CALL PSELGET( 'All', TOP, ELEM, T, LILO, LILO-1, DESCT )
IF( ELEM.NE.ZERO ) THEN
IF( LILO.LT.(J+K-1)*NB ) THEN
IWORK(ILILO+K-1) = IWORK(ILILO+K-1) + 1
ELSE
IWORK(ILILO+K-1) = IWORK(ILILO+K-1) - 1
END IF
END IF
END IF
*
* Set a proper LIHI value for each window. Also find the
* processors corresponding to the corresponding windows.
*
IWORK( ILIHI+K-1 ) = IWORK( ILILO+K-1 )
IWORK( IRSRC+K-1 ) = INDXG2P( IWORK(ILILO+K-1), NB, MYROW,
$ DESCT( RSRC_ ), NPROW )
IWORK( ICSRC+K-1 ) = INDXG2P( IWORK(ILILO+K-1), NB, MYCOL,
$ DESCT( CSRC_ ), NPCOL )
TILO = IWORK(ILILO+K-1)
TIHI = MIN( N, ICEIL( TILO, NB ) * NB )
DO 90 KK = TIHI, TILO, -1
IF( SELECT(KK).NE.0 ) THEN
IWORK(ILIHI+K-1) = MAX(IWORK(ILIHI+K-1) , KK )
IWORK(ILSEL+K-1) = IWORK(ILSEL+K-1) + 1
IF( IWORK(ILSEL+K-1).GT.WINEIG ) THEN
IWORK(ILIHI+K-1) = KK
IWORK(ILSEL+K-1) = 1
END IF
END IF
90 CONTINUE
*
* Fix each LIHI to avoid that bottom of window cuts 2-by-2
* block. We exclude such a block if located on block (process)
* border and on window border or if an inclusion would cause
* violation on the maximum number of eigenvalues to reorder
* inside each window. If only on window border, we include it.
* The excluded block is included automatically later when a
* subcluster is reordered into the block from South-East.
*
LIHI = IWORK(ILIHI+K-1)
IF( LIHI.LT.N ) THEN
CALL PSELGET( 'All', TOP, ELEM, T, LIHI+1, LIHI, DESCT )
IF( ELEM.NE.ZERO ) THEN
IF( ICEIL( LIHI, NB ) .NE. ICEIL( LIHI+1, NB ) .OR.
$ IWORK( ILSEL+K-1 ).EQ.WINEIG ) THEN
IWORK( ILIHI+K-1 ) = IWORK( ILIHI+K-1 ) - 1
IF( IWORK( ILSEL+K-1 ).GT.2 )
$ IWORK( ILSEL+K-1 ) = IWORK( ILSEL+K-1 ) - 1
ELSE
IWORK( ILIHI+K-1 ) = IWORK( ILIHI+K-1 ) + 1
IF( SELECT(LIHI+1).NE.0 )
$ IWORK( ILSEL+K-1 ) = IWORK( ILSEL+K-1 ) + 1
END IF
END IF
END IF
80 CONTINUE
*
* Fix the special cases of LSEL = 0 and LILO = LIHI for each
* window by assuring that the stop-condition for local reordering
* is fulfilled directly. Do this by setting LIHI = startposition
* for the corresponding block and LILO = LIHI + 1.
*
DO 85 K = 1, NMWIN2
LILO = IWORK( ILILO + K - 1 )
LIHI = IWORK( ILIHI + K - 1 )
LSEL = IWORK( ILSEL + K - 1 )
IF( LSEL.EQ.0 .OR. LILO.EQ.LIHI ) THEN
LIHI = IWORK( ILIHI + K - 1 )
IWORK( ILIHI + K - 1 ) = (ICEIL(LIHI,NB)-1)*NB + 1
IWORK( ILILO + K - 1 ) = IWORK( ILIHI + K - 1 ) + 1
END IF
85 CONTINUE
*
* Associate all processors with the first computational window
* that should be activated, if possible.
*
LILO = IHI
LIHI = ILO
LSEL = M
FIRST = .TRUE.
DO 95 WINDOW = 1, NMWIN2
RSRC = IWORK(IRSRC+WINDOW-1)
CSRC = IWORK(ICSRC+WINDOW-1)
IF( MYROW.EQ.RSRC .OR. MYCOL.EQ.CSRC ) THEN
TLILO = IWORK( ILILO + WINDOW - 1 )
TLIHI = IWORK( ILIHI + WINDOW - 1 )
TLSEL = IWORK( ILSEL + WINDOW - 1 )
IF( (.NOT. ( LIHI .GE. LILO + LSEL ) ) .AND.
$ ( (TLIHI .GE. TLILO + TLSEL) .OR. FIRST ) ) THEN
IF( FIRST ) FIRST = .FALSE.
LILO = TLILO
LIHI = TLIHI
LSEL = TLSEL
GO TO 97
END IF
END IF
95 CONTINUE
97 CONTINUE
*
* Exclude all processors that are not involved in any
* computational window right now.
*
IERR = 0
IF( LILO.EQ.IHI .AND. LIHI.EQ.ILO .AND. LSEL.EQ.M )
$ GO TO 114
*
* Make sure all processors associated with a compuational window
* enter the local reordering the first time.
*
FIRST = .TRUE.
*
* Globally for all computational windows:
* While ( LIHI >= LILO + LSEL ) do
ROUND = 1
130 CONTINUE
IF( FIRST .OR. ( LIHI .GE. LILO + LSEL ) ) THEN
*
* Perform computations in parallel: loop through all
* compuational windows, do local reordering and accumulate
* transformations, broadcast them in the corresponding block
* row and columns and compute the corresponding updates.
*
DO 110 WINDOW = 1, NMWIN2
RSRC = IWORK(IRSRC+WINDOW-1)
CSRC = IWORK(ICSRC+WINDOW-1)
*
* The process on the block diagonal computes the
* reordering.
*
IF( MYROW.EQ.RSRC .AND. MYCOL.EQ.CSRC ) THEN
LILO = IWORK(ILILO+WINDOW-1)
LIHI = IWORK(ILIHI+WINDOW-1)
LSEL = IWORK(ILSEL+WINDOW-1)
*
* Compute the local value of I -- start position.
*
I = MAX( LILO, LIHI - WINSIZ + 1 )
*
* Fix my I to avoid that top of window cuts a 2-by-2
* block.
*
IF( I.GT.LILO ) THEN
CALL INFOG2L( I, I-1, DESCT, NPROW, NPCOL, MYROW,
$ MYCOL, ILOC, JLOC, RSRC, CSRC )
IF( T( LLDT*(JLOC-1) + ILOC ).NE.ZERO )
$ I = I + 1
END IF
*
* Compute local indicies for submatrix to operate on.
*
CALL INFOG2L( I, I, DESCT, NPROW, NPCOL,
$ MYROW, MYCOL, ILOC1, JLOC1, RSRC, CSRC )
*
* The active window is ( I:LIHI, I:LIHI ). Reorder
* eigenvalues within this window and pipeline
* transformations.
*
NWIN = LIHI - I + 1
KS = 0
PITRAF = IPIW
PDTRAF = IPW2
*
PAIR = .FALSE.
DO 140 K = I, LIHI
IF( PAIR ) THEN
PAIR = .FALSE.
ELSE
SWAP = SELECT( K ).NE.0
IF( K.LT.LIHI ) THEN
CALL INFOG2L( K+1, K, DESCT, NPROW, NPCOL,
$ MYROW, MYCOL, ILOC, JLOC, RSRC, CSRC )
IF( T( LLDT*(JLOC-1) + ILOC ).NE.ZERO )
$ PAIR = .TRUE.
END IF
IF( SWAP ) THEN
KS = KS + 1
*
* Swap the K-th block to position I+KS-1.
*
IERR = 0
KK = K - I + 1
KKS = KS
IF( KK.NE.KS ) THEN
NITRAF = LIWORK - PITRAF + 1
NDTRAF = LWORK - PDTRAF + 1
CALL BSTREXC( NWIN,
$ T(LLDT*(JLOC1-1) + ILOC1), LLDT, KK,
$ KKS, NITRAF, IWORK( PITRAF ), NDTRAF,
$ WORK( PDTRAF ), WORK(IPW1), IERR )
PITRAF = PITRAF + NITRAF
PDTRAF = PDTRAF + NDTRAF
*
* Update array SELECT.
*
IF ( PAIR ) THEN
DO 150 J = I+KK-1, I+KKS, -1
SELECT(J+1) = SELECT(J-1)
150 CONTINUE
SELECT(I+KKS-1) = 1
SELECT(I+KKS) = 1
ELSE
DO 160 J = I+KK-1, I+KKS, -1
SELECT(J) = SELECT(J-1)
160 CONTINUE
SELECT(I+KKS-1) = 1
END IF
*
IF ( IERR.EQ.1 .OR. IERR.EQ.2 ) THEN
*
* Some blocks are too close to swap:
* prepare to leave in a clean fashion. If
* IERR.EQ.2, we must update SELECT to
* account for the fact that the 2 by 2
* block to be reordered did split and the
* first part of this block is already
* reordered.
*
IF ( IERR.EQ.2 ) THEN
SELECT( I+KKS-3 ) = 1
SELECT( I+KKS-1 ) = 0
KKS = KKS + 1
END IF
*
* Update off-diagonal blocks immediately.
*
GO TO 170
END IF
KS = KKS
END IF
IF( PAIR )
$ KS = KS + 1
END IF
END IF
140 CONTINUE
END IF
110 CONTINUE
170 CONTINUE
*
* The on-diagonal processes save their information from the
* local reordering in the integer buffer. This buffer is
* broadcasted to updating processors, see below.
*
DO 175 WINDOW = 1, NMWIN2
RSRC = IWORK(IRSRC+WINDOW-1)
CSRC = IWORK(ICSRC+WINDOW-1)
IF( MYROW.EQ.RSRC .AND. MYCOL.EQ.CSRC ) THEN
IBUFF( 1 ) = I
IBUFF( 2 ) = NWIN
IBUFF( 3 ) = PITRAF
IBUFF( 4 ) = KS
IBUFF( 5 ) = PDTRAF
IBUFF( 6 ) = NDTRAF
ILEN = PITRAF - IPIW
DLEN = PDTRAF - IPW2
IBUFF( 7 ) = ILEN
IBUFF( 8 ) = DLEN
END IF
175 CONTINUE
*
* For the updates with respect to the local reordering, we
* organize the updates in two phases where the update
* "direction" (controlled by the DIR variable below) is first
* chosen to be the corresponding rows, then the corresponding
* columns.
*
DO 1111 DIR = 1, 2
*
* Broadcast information about the reordering and the
* accumulated transformations: I, NWIN, PITRAF, NITRAF,
* PDTRAF, NDTRAF. If no broadcast is performed, use an
* artificial value of KS to prevent updating indicies for
* windows already finished (use KS = -1).
*
DO 111 WINDOW = 1, NMWIN2
RSRC = IWORK(IRSRC+WINDOW-1)
CSRC = IWORK(ICSRC+WINDOW-1)
IF( MYROW.EQ.RSRC .OR. MYCOL.EQ.CSRC ) THEN
LILO = IWORK(ILILO+WINDOW-1)
LIHI = IWORK(ILIHI+WINDOW-1)
LSEL = IWORK(ILSEL+WINDOW-1)
END IF
IF( MYROW.EQ.RSRC .AND. MYCOL.EQ.CSRC ) THEN
IF( NPCOL.GT.1 .AND. DIR.EQ.1 )
$ CALL IGEBS2D( ICTXT, 'Row', TOP, 8, 1, IBUFF, 8 )
IF( NPROW.GT.1 .AND. DIR.EQ.2 )
$ CALL IGEBS2D( ICTXT, 'Col', TOP, 8, 1, IBUFF, 8 )
ELSEIF( MYROW.EQ.RSRC .OR. MYCOL.EQ.CSRC ) THEN
IF( NPCOL.GT.1 .AND. DIR.EQ.1 .AND. MYROW.EQ.RSRC )
$ THEN
IF( FIRST .OR. (LIHI .GE. LILO + LSEL) ) THEN
CALL IGEBR2D( ICTXT, 'Row', TOP, 8, 1, IBUFF, 8,
$ RSRC, CSRC )
I = IBUFF( 1 )
NWIN = IBUFF( 2 )
PITRAF = IBUFF( 3 )
KS = IBUFF( 4 )
PDTRAF = IBUFF( 5 )
NDTRAF = IBUFF( 6 )
ILEN = IBUFF( 7 )
DLEN = IBUFF( 8 )
ELSE
ILEN = 0
DLEN = 0
KS = -1
END IF
END IF
IF( NPROW.GT.1 .AND. DIR.EQ.2 .AND. MYCOL.EQ.CSRC )
$ THEN
IF( FIRST .OR. (LIHI .GE. LILO + LSEL) ) THEN
CALL IGEBR2D( ICTXT, 'Col', TOP, 8, 1, IBUFF, 8,
$ RSRC, CSRC )
I = IBUFF( 1 )
NWIN = IBUFF( 2 )
PITRAF = IBUFF( 3 )
KS = IBUFF( 4 )
PDTRAF = IBUFF( 5 )
NDTRAF = IBUFF( 6 )
ILEN = IBUFF( 7 )
DLEN = IBUFF( 8 )
ELSE
ILEN = 0
DLEN = 0
KS = -1
END IF
END IF
END IF
*
* Broadcast the accumulated transformations - copy all
* information from IWORK(IPIW:PITRAF-1) and
* WORK(IPW2:PDTRAF-1) to a buffer and broadcast this
* buffer in the corresponding block row and column. On
* arrival, copy the information back to the correct part of
* the workspace. This step is avoided if no computations
* were performed at the diagonal processor, i.e.,
* BUFFLEN = 0.
*
IF( MYROW.EQ.RSRC .AND. MYCOL.EQ.CSRC ) THEN
BUFFER = PDTRAF
BUFFLEN = DLEN + ILEN
IF( BUFFLEN.NE.0 ) THEN
DO 180 INDX = 1, ILEN
WORK( BUFFER+INDX-1 ) =
$ FLOAT( IWORK(IPIW+INDX-1) )
180 CONTINUE
CALL SLAMOV( 'All', DLEN, 1, WORK( IPW2 ),
$ DLEN, WORK(BUFFER+ILEN), DLEN )
IF( NPCOL.GT.1 .AND. DIR.EQ.1 ) THEN
CALL SGEBS2D( ICTXT, 'Row', TOP, BUFFLEN, 1,