from transformers import BertConfig


# Subclassing under the same name deliberately shadows the imported
# `transformers.BertConfig`: downstream code keeps importing `BertConfig`
# from this module and transparently gains the extra options below.
class BertConfig(BertConfig):

    def __init__(
        self,
        alibi_starting_size: int = 512,
        attention_probs_dropout_prob: float = 0.0,

        # MLP variant
        use_glu_mlp: bool = True,
        use_monarch_mlp: bool = False,
        monarch_mlp_nblocks: int = 4,

        # position embeddings
        use_positional_encodings: bool = False,
        max_position_embeddings: int = 512,

        # long-convolution residual branch
        residual_long_conv: bool = False,

        # Hyena / long-convolution hyperparameters
        bidirectional: bool = True,
        hyena_w_mod: int = 1,
        hyena_filter_dropout: float = 0.2,
        hyena_filter_order: int = 64,
        hyena_training_additions: bool = False,

        # fused kernel
        use_flash_mm: bool = False,

        # pooling strategy
        pool_all: bool = False,

        **kwargs,
    ):
        """Configuration class for MosaicBERT.

        Args:
            alibi_starting_size (int): Use `alibi_starting_size` to determine how large of an alibi tensor to
                create when initializing the model. You should be able to ignore this parameter in most cases.
                Defaults to 512.
            attention_probs_dropout_prob (float): By default, turn off attention dropout in MosaicBERT.
                Defaults to 0.0.

        The remaining arguments select the MLP variant, positional-encoding
        behavior, and long-convolution/Hyena hyperparameters; anything else in
        `**kwargs` is forwarded to the standard `transformers.BertConfig`.
        """
        super().__init__(
            attention_probs_dropout_prob=attention_probs_dropout_prob, **kwargs)
        self.alibi_starting_size = alibi_starting_size

        self.use_glu_mlp = use_glu_mlp
        self.use_monarch_mlp = use_monarch_mlp
        self.monarch_mlp_nblocks = monarch_mlp_nblocks

        self.use_positional_encodings = use_positional_encodings
        self.max_position_embeddings = max_position_embeddings

        self.residual_long_conv = residual_long_conv

        self.bidirectional = bidirectional
        self.hyena_w_mod = hyena_w_mod
        self.hyena_filter_dropout = hyena_filter_dropout
        self.hyena_filter_order = hyena_filter_order
        self.hyena_training_additions = hyena_training_additions

        self.use_flash_mm = use_flash_mm

        self.pool_all = pool_all
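

# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the original module): one way an attention
# layer might size its ALiBi bias from `alibi_starting_size`, using the
# geometric slope schedule of Press et al. (2021). The function name and the
# rebuild-on-overflow behavior are assumptions, not this repo's actual code.
# ---------------------------------------------------------------------------
import torch


def _example_alibi_bias(num_heads: int, size: int) -> torch.Tensor:
    # Per-head slopes: 2^(-8/n), 2^(-16/n), ..., 2^(-8) for n heads.
    slopes = torch.tensor(
        [2.0 ** (-8.0 * (i + 1) / num_heads) for i in range(num_heads)])
    # Symmetric relative distances |i - j|, as befits a bidirectional encoder.
    pos = torch.arange(size)
    distances = (pos[None, :] - pos[:, None]).abs()
    # (num_heads, size, size) additive attention bias; layers would rebuild
    # this with a larger `size` if an input exceeds `alibi_starting_size`.
    return -slopes[:, None, None] * distances[None, :, :]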
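

# Example usage (a sketch; assumes the standard Hugging Face `transformers`
# config API, with plain BERT fields such as `hidden_size` passing through
# `**kwargs` to the parent class):
if __name__ == "__main__":
    config = BertConfig(
        alibi_starting_size=1024,
        use_monarch_mlp=True,
        monarch_mlp_nblocks=4,
        hidden_size=768,  # ordinary BertConfig field, forwarded via **kwargs
    )
    print(config.alibi_starting_size, config.use_monarch_mlp, config.hidden_size)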