Upload SegmentNT
Files changed:
- config.json (+16 -1)
- modeling_segment_nt.py (+3 -2)
- pytorch_model.bin (+1 -1)
- segment_nt_config.py (+3 -3)
config.json CHANGED
@@ -10,6 +10,22 @@
   },
   "emb_layer_norm_before": false,
   "esmfold_config": null,
+  "features": [
+    "protein_coding_gene",
+    "lncRNA",
+    "exon",
+    "intron",
+    "splice_donor",
+    "splice_acceptor",
+    "5UTR",
+    "3UTR",
+    "CTCF-bound",
+    "polyA_signal",
+    "enhancer_Tissue_specific",
+    "enhancer_Tissue_invariant",
+    "promoter_Tissue_specific",
+    "promoter_Tissue_invariant"
+  ],
   "hidden_dropout_prob": 0.0,
   "hidden_size": 1024,
   "initializer_range": 0.02,
@@ -20,7 +36,6 @@
   "max_position_embeddings": 2050,
   "model_type": "esm",
   "num_attention_heads": 16,
-  "num_features": 14,
   "num_hidden_layers": 29,
   "num_layers_head": 2,
   "pad_token_id": 1,
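The hard-coded "num_features": 14 is replaced by an explicit "features" list, so downstream code can derive the count from the feature names themselves. A minimal sketch of reading the new field, assuming a locally downloaded config.json:

    import json

    # Read the updated config and derive the feature count from the
    # explicit "features" list instead of a hard-coded "num_features".
    with open("config.json") as f:
        config = json.load(f)

    features = config["features"]
    print(len(features))   # 14 for this checkpoint
    print(features[0])     # "protein_coding_gene"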
modeling_segment_nt.py CHANGED
@@ -1159,6 +1159,7 @@ class SegmentNT(EsmPreTrainedModel):
         super().__init__(config)
         self.num_labels = config.num_labels
         self.config = config
+        self.num_features = len(config.features)
 
         self.esm = EsmModel(config, add_pooling_layer=False)
 
@@ -1171,7 +1172,7 @@
                 embed_dim * (2**i) for i in range(num_layers)
             ),
         )
-        self.fc = nn.Linear(in_features=embed_dim, out_features=6 * 2 * config.num_features)
+        self.fc = nn.Linear(in_features=embed_dim, out_features=6 * 2 * self.num_features)
         self.activation_fn = nn.SiLU()
 
         self.init_weights()
@@ -1232,7 +1233,7 @@
         logits = self.fc(x)
 
         # Final reshape to have logits per nucleotides, per feature
-        logits = torch.reshape(logits, (x.shape[0], x.shape[1] * 6, self.config.num_features, 2))
+        logits = torch.reshape(logits, (x.shape[0], x.shape[1] * 6, self.num_features, 2))
 
         # Add logits to the ESM outputs
         outputs["logits"] = logits
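Sizing the head from len(config.features) keeps the 2-way logits per nucleotide and per feature consistent with the config. A minimal sketch of how the reshaped logits tensor can be unpacked; the sizes and the softmax interpretation are illustrative assumptions, not part of this commit:

    import torch

    batch, tokens, num_features = 2, 10, 14
    # Shape produced by the reshape above: 6 nucleotides per token,
    # num_features genomic elements, 2 classes each.
    logits = torch.randn(batch, tokens * 6, num_features, 2)

    # Assumed interpretation: softmax over the last axis yields a
    # per-nucleotide, per-feature probability pair.
    probs = torch.softmax(logits, dim=-1)
    print(probs.shape)  # torch.Size([2, 60, 14, 2])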
pytorch_model.bin CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:ae081cbe0ee351a510930a8d2d5a94e150c1e40afdb93e69fea5d345639ad2cf
 size 2237478985
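Only the Git LFS pointer is updated here: the weights keep the same byte size while the content hash changes. A minimal sketch for checking a downloaded checkpoint against the new oid, assuming the file sits in the working directory:

    import hashlib

    # Recompute the SHA-256 of the weights and compare it to the
    # "oid sha256:..." value from the LFS pointer file.
    expected = "ae081cbe0ee351a510930a8d2d5a94e150c1e40afdb93e69fea5d345639ad2cf"
    h = hashlib.sha256()
    with open("pytorch_model.bin", "rb") as f:
        for chunk in iter(lambda: f.read(1 << 20), b""):
            h.update(chunk)
    assert h.hexdigest() == expected, "checksum mismatch"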
segment_nt_config.py CHANGED
@@ -15,7 +15,7 @@
 """ ESM model configuration"""
 
 from dataclasses import asdict, dataclass
-from typing import Optional
+from typing import List, Optional
 
 from transformers import PretrainedConfig, logging
 
@@ -99,6 +99,7 @@ class SegmentNTConfig(PretrainedConfig):
 
     def __init__(
         self,
+        features=None,
         vocab_size=None,
         mask_token_id=None,
         pad_token_id=None,
@@ -121,7 +122,6 @@ class SegmentNTConfig(PretrainedConfig):
         add_bias_fnn=True,
         rescaling_factor=None,
         num_layers_head=2,
-        num_features=14,
         **kwargs,
     ):
         super().__init__(
@@ -147,7 +147,7 @@ class SegmentNTConfig(PretrainedConfig):
         self.add_bias_fnn = add_bias_fnn
         # Arguments needed for Segment NT
         self.num_layers_head = num_layers_head
-        self.num_features = num_features
+        self.features = features
         self.rescaling_factor = rescaling_factor
         if is_folding_model:
             if esmfold_config is None:
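With this change a config is built from the feature names; num_features is no longer an argument. A minimal usage sketch, assuming the module is importable as segment_nt_config and using an illustrative two-feature list rather than the shipped 14:

    from segment_nt_config import SegmentNTConfig

    # The model derives its head size as len(config.features).
    config = SegmentNTConfig(features=["exon", "intron"])
    print(len(config.features))  # 2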