@@ -594,10 +594,29 @@ public void run() {
594
594
Map <HboxContainerId , String > containersAppFinishTime =
595
595
applicationContext .getContainersAppFinishTime ();
596
596
597
+ // container info schema (since 1.3a b30b967d7c283e8ac542b66413fcc2bc5c5fb95c):
598
+ // 0: node url
599
+ // 1: gpu device id
600
+ // 2: role
601
+ // 3: status
602
+ // 4: cpu metrics
603
+ // 5: gpu mem metrics
604
+ // 6: gpu util metrics
605
+ // 7: start time
606
+ // 8: finish time
607
+ // 9: percent progress
608
+ // 10: log url
609
+ // 11: stats - cpu
610
+ // 12: stats - gpu mem
611
+ // 13: stats - gpu util
612
+ // 14: stats - mem usage warn (before 1.9.2: present only if cpuStatistics.size > 0; since 1.9.2: always present, "-" when no value is computed)
613
+ // 15: rank (since 1.9.2)
614
+ int workerIdx = 0 ;
597
615
for (Container container : workerContainers ) {
598
616
List <String > containerMessage = new ArrayList <>();
599
617
containerMessage .add (container .getNodeHttpAddress ());
600
618
HboxContainerId currentContainerID = new HboxContainerId (container .getId ());
619
+ String rank = "-" ;
601
620
if (applicationContext .getContainerGPUDevice (currentContainerID ) != null ) {
602
621
if (applicationContext
603
622
.getContainerGPUDevice (currentContainerID )
@@ -617,6 +636,7 @@ public void run() {
617
636
containerMessage .add (HboxConstants .CHIEF );
618
637
} else {
619
638
containerMessage .add (HboxConstants .WORKER );
639
+ rank = "" + workerIdx ++;
620
640
}
621
641
622
642
HboxContainerStatus status = applicationContext .getContainerStatus (currentContainerID );
@@ -677,6 +697,8 @@ public void run() {
677
697
} else {
678
698
usageStatistics .add ("false" );
679
699
}
700
+ } else {
701
+ usageStatistics .add ("-" ); // container info schema idx=14
680
702
}
681
703
682
704
if (containersAppStartTime .get (currentContainerID ) != null
@@ -729,13 +751,16 @@ public void run() {
729
751
container .getId ().toString (),
730
752
userName ));
731
753
containerMessage .addAll (usageStatistics );
754
+ containerMessage .add (rank ); // container info schema idx=15
732
755
logMessage .put (container .getId ().toString (), containerMessage );
733
756
}
734
757
758
+ int psIdx = 0 ;
735
759
for (Container container : psContainers ) {
736
760
List <String > containerMessage = new ArrayList <>();
737
761
containerMessage .add (container .getNodeHttpAddress ());
738
762
HboxContainerId currentContainerID = new HboxContainerId (container .getId ());
763
+ String rank = "-" ;
739
764
if (applicationContext .getContainerGPUDevice (currentContainerID ) != null ) {
740
765
if (applicationContext
741
766
.getContainerGPUDevice (currentContainerID )
@@ -750,17 +775,22 @@ public void run() {
750
775
containerMessage .add ("-" );
751
776
}
752
777
if (hboxAppType .equals ("TENSORFLOW" ) || "TENSOR2TENSOR" .equals (hboxAppType )) {
753
- containerMessage .add ("ps" );
778
+ containerMessage .add (HboxConstants .PS );
779
+ rank = "" + psIdx ++;
754
780
} else if (hboxAppType .equals ("MXNET" )
755
781
|| hboxAppType .equals ("DISTLIGHTLDA" )
756
782
|| hboxAppType .equals ("XFLOW" )) {
757
- containerMessage .add ("server" );
783
+ containerMessage .add (HboxConstants . SERVER );
758
784
} else if (hboxAppType .equals ("XDL" )) {
759
785
if (currentContainerID .toString ().equals (schedulerContainerId )) {
760
786
containerMessage .add (HboxConstants .SCHEDULER );
761
787
} else {
762
- containerMessage .add ("ps" );
788
+ containerMessage .add (HboxConstants .PS );
789
+ rank = "" + psIdx ++;
763
790
}
791
+ } else {
792
+ containerMessage .add (HboxConstants .PS );
793
+ rank = "" + psIdx ++;
764
794
}
765
795
HboxContainerStatus status = applicationContext .getContainerStatus (currentContainerID );
766
796
if (status != null ) {
@@ -814,6 +844,8 @@ public void run() {
814
844
} else {
815
845
usageStatistics .add ("false" );
816
846
}
847
+ } else {
848
+ usageStatistics .add ("-" ); // container info schema idx=14
817
849
}
818
850
819
851
if (containersAppStartTime .get (currentContainerID ) != null
@@ -841,6 +873,7 @@ public void run() {
841
873
container .getId ().toString (),
842
874
userName ));
843
875
containerMessage .addAll (usageStatistics );
876
+ containerMessage .add (rank ); // container info schema idx=15
844
877
logMessage .put (container .getId ().toString (), containerMessage );
845
878
}
846
879
0 commit comments