Skip to content

Commit 9ab4c21

Browse files
committed
Add Megatron bert
1 parent 6353c5f commit 9ab4c21

File tree

2 files changed

+121
-1
lines changed

2 files changed

+121
-1
lines changed

parallelformers/policies/base/auto.py

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,6 @@
1818
from torch import nn
1919

2020
from parallelformers.policies.base import Policy
21-
from parallelformers.policies.gptj import GPTJPolicy
2221

2322

2423
class AutoPolicy:
@@ -641,11 +640,20 @@ def __init__(self):
641640

642641
with suppress(Exception):
643642
from transformers.models.gptj.modeling_gptj import GPTJPreTrainedModel
643+
from parallelformers.policies.gptj import GPTJPolicy
644644

645645
self.builtin_policies[GPTJPreTrainedModel] = [
646646
GPTJPolicy,
647647
]
648648

649+
with suppress(Exception):
650+
from transformers.models.megatron_bert import MegatronBertPreTrainedModel
651+
from parallelformers.policies.megatron_bert import MegatronBertPolicy
652+
653+
self.builtin_policies[MegatronBertPreTrainedModel] = [
654+
MegatronBertPolicy,
655+
]
656+
649657
def get_policy(self, model: nn.Module) -> Union[List[Policy], None]:
650658
"""
651659
Find appropriate policies for the current model
Lines changed: 112 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,112 @@
1+
# Copyright 2021 TUNiB inc.
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
from transformers.models.megatron_bert.modeling_megatron_bert import \
16+
MegatronBertLayer
17+
18+
from parallelformers.policies.base import Layer, Policy
19+
from parallelformers.transformers.modeling_bert import BertEmbeddings_
20+
from parallelformers.utils import AllReduceLinear
21+
22+
23+
class MegatronBertPolicy(Policy):
    """Tensor-parallelism policy for MegatronBERT.

    Describes, for one ``MegatronBertLayer``, which parameters are sliced
    column-wise across ranks (Q/K/V and the MLP input projection) and which
    are sliced row-wise with an all-reduce on their output (the attention
    output and MLP output projections), plus the per-layer attribute
    overrides needed once the weights have been partitioned.
    """

    @staticmethod
    def replace_arguments(config, world_size):
        """Attribute overrides applied to each layer after slicing.

        Both the self-attention and (optional) cross-attention modules get
        their head count and flattened head size divided by ``world_size``
        so they match their rank-local weight shards.
        """
        local_head_size = config.hidden_size // world_size
        local_num_heads = config.num_attention_heads // world_size
        return {
            # 1. reduce hidden size
            "attention.self.all_head_size": local_head_size,
            "crossattention.self.all_head_size": local_head_size,
            # 2. reduce number of heads
            "attention.self.num_attention_heads": local_num_heads,
            "crossattention.self.num_attention_heads": local_num_heads,
        }

    @staticmethod
    def replace_modules():
        """Swap the stock embedding class for the parallel-aware variant."""
        return {"BertEmbeddings": BertEmbeddings_}

    @staticmethod
    def attn_qkv():
        """Column-parallel Q/K/V projections.

        Self-attention entries are mandatory; cross-attention entries set
        ``ignore_checker=True`` so models without cross-attention still load.
        """
        entries = []
        for module, optional in (("attention", False), ("crossattention", True)):
            for proj in ("query", "key", "value"):
                kwargs = {
                    "weight": f"{module}.self.{proj}.weight",
                    "bias": f"{module}.self.{proj}.bias",
                }
                if optional:
                    kwargs["ignore_checker"] = True
                entries.append(Layer(**kwargs))
        return entries

    @staticmethod
    def attn_out():
        """Row-parallel attention output projections (all-reduced)."""
        self_out = Layer(
            weight="attention.output.dense.weight",
            bias="attention.output.dense.bias",
            replace=AllReduceLinear,
        )
        # ignore_checker: cross-attention may be absent from the model.
        cross_out = Layer(
            weight="crossattention.output.dense.weight",
            bias="crossattention.output.dense.bias",
            replace=AllReduceLinear,
            ignore_checker=True,
        )
        return [self_out, cross_out]

    @staticmethod
    def mlp_in():
        """Column-parallel MLP input projection."""
        return [
            Layer(
                weight="intermediate.dense.weight",
                bias="intermediate.dense.bias",
            ),
        ]

    @staticmethod
    def mlp_out():
        """Row-parallel MLP output projection (all-reduced)."""
        return [
            Layer(
                weight="output.dense.weight",
                bias="output.dense.bias",
                replace=AllReduceLinear,
            ),
        ]

    @staticmethod
    def original_layer_class():
        """The transformer layer class this policy applies to."""
        return MegatronBertLayer

0 commit comments

Comments
 (0)