Update README.md
Browse files
README.md
CHANGED
|
@@ -1,9 +1,9 @@
|
|
| 1 |
-
---
|
| 2 |
-
library_name: transformers
|
| 3 |
-
datasets:
|
| 4 |
-
- bigcode/the-stack-v2
|
| 5 |
-
license: bigcode-openrail-m
|
| 6 |
-
---
|
| 7 |
|
| 8 |
# Model Card for Model ID
|
| 9 |
|
|
@@ -29,24 +29,17 @@ from transformers import AutoModel
|
|
| 29 |
from transformers import AutoTokenizer
|
| 30 |
|
| 31 |
#import the model
|
| 32 |
-
model = AutoModel.from_pretrained("andreagurioli1995/ModularStarEncoder
|
| 33 |
|
| 34 |
#import the tokenizer
|
| 35 |
-
tokenizer = AutoTokenizer.from_pretrained("andreagurioli1995/ModularStarEncoder
|
| 36 |
|
| 37 |
|
| 38 |
-
language = "yourlanguagelowercased"
|
| 39 |
-
|
| 40 |
-
#instruction in case of code embedding in a code language
|
| 41 |
-
instruction_code = f"Represent this {language} code snippet for retrieval:"
|
| 42 |
-
|
| 43 |
-
#instruction in case of code embedding in English
|
| 44 |
-
instruction_natural_language = "Represent this code description for retrieving supporting snippets of code:"
|
| 45 |
|
| 46 |
code_snippet = "your code to embed here"
|
| 47 |
|
| 48 |
-
#You should follow this pattern to embed a snippet of code
|
| 49 |
-
sentence = f"{tokenizer.sep_token}{
|
| 50 |
|
| 51 |
#Tokenizing your sentence
|
| 52 |
tokenized_sensence = tokenizer(sentence, return_tensors="pt",truncation=True, max_length=2048)
|
|
@@ -55,10 +48,13 @@ tokenized_sensence = tokenizer(sentence, return_tensors="pt",truncation=True, ma
|
|
| 55 |
embedded_sentence = model(**sentence)
|
| 56 |
```
|
| 57 |
|
| 58 |
-
You will get as an output
|
| 59 |
|
| 60 |
-
-
|
| 61 |
-
-
|
|
|
|
|
|
|
|
|
|
| 62 |
- attentions: attention scores from the encoder
|
| 63 |
|
| 64 |
### Model Description
|
|
|
|
| 1 |
+
---
|
| 2 |
+
library_name: transformers
|
| 3 |
+
datasets:
|
| 4 |
+
- bigcode/the-stack-v2
|
| 5 |
+
license: bigcode-openrail-m
|
| 6 |
+
---
|
| 7 |
|
| 8 |
# Model Card for Model ID
|
| 9 |
|
|
|
|
| 29 |
from transformers import AutoTokenizer
|
| 30 |
|
| 31 |
#import the model
|
| 32 |
+
model = AutoModel.from_pretrained("andreagurioli1995/ModularStarEncoder", trust_remote_code=True)
|
| 33 |
|
| 34 |
#import the tokenizer
|
| 35 |
+
tokenizer = AutoTokenizer.from_pretrained("andreagurioli1995/ModularStarEncoder")
|
| 36 |
|
| 37 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 38 |
|
| 39 |
code_snippet = "your code to embed here"
|
| 40 |
|
| 41 |
+
#You should follow this pattern to embed a snippet of code
|
| 42 |
+
sentence = f"{tokenizer.sep_token}{code_snippet}{tokenizer.cls_token}"
|
| 43 |
|
| 44 |
#Tokenizing your sentence
|
| 45 |
tokenized_sentence = tokenizer(sentence, return_tensors="pt",truncation=True, max_length=2048)
|
|
|
|
| 48 |
embedded_sentence = model(**tokenized_sentence)
|
| 49 |
```
|
| 50 |
|
| 51 |
+
You will get as an output six elements:
|
| 52 |
|
| 53 |
+
- last_hidden_state: the representation of the last hidden state from the model;
|
| 54 |
+
- hidden_states: raw representation from all the hidden states of the model, without pooling, normalization, and projection
|
| 55 |
+
- loss: loss value if a ground truth is given (None if used in inference)
|
| 56 |
+
- prediction_logits: prediction scores from masked language modeling head
|
| 57 |
+
- seq_relationship_scores: prediction scores of in-context loss (concatenate multiple samples with the separator token if you want a meaningful score)
|
| 58 |
- attentions: attention scores from the encoder
|
| 59 |
|
| 60 |
### Model Description
|