remove unnecessary space before instruction
README.md CHANGED
@@ -69,14 +69,14 @@ model.eval()

# Image + Text -> Text
image = Image.open(requests.get('https://github.com/haon-chen/mmE5/blob/main/figures/example.jpg?raw=true', stream=True).raw)
+inputs = processor(text='<|image|><|begin_of_text|>Represent the given image with the following question: What is in the image', images=[image], return_tensors="pt").to("cuda")
qry_output = last_pooling(model(**inputs, return_dict=True, output_hidden_states=True).hidden_states[-1], inputs['attention_mask'])

string = 'A cat and a dog'
text_inputs = processor(text=string, return_tensors="pt").to("cuda")
tgt_output = last_pooling(model(**text_inputs, return_dict=True, output_hidden_states=True).hidden_states[-1], text_inputs['attention_mask'])
print(string, '=', compute_similarity(qry_output, tgt_output))
+## A cat and a dog = tensor([[0.3945]], device='cuda:0', dtype=torch.bfloat16)

string = 'A cat and a tiger'
text_inputs = processor(text=string, return_tensors="pt").to("cuda")

@@ -88,19 +88,19 @@ print(string, '=', compute_similarity(qry_output, tgt_output))
inputs = processor(text='Find me an everyday image that matches the given caption: A cat and a dog.', return_tensors="pt").to("cuda")
qry_output = last_pooling(model(**inputs, return_dict=True, output_hidden_states=True).hidden_states[-1], inputs['attention_mask'])

+string = '<|image|><|begin_of_text|>Represent the given image.'
tgt_inputs = processor(text=string, images=[image], return_tensors="pt").to("cuda")
tgt_output = last_pooling(model(**tgt_inputs, return_dict=True, output_hidden_states=True).hidden_states[-1], tgt_inputs['attention_mask'])
print(string, '=', compute_similarity(qry_output, tgt_output))
+## <|image|><|begin_of_text|>Represent the given image. = tensor([[0.4141]], device='cuda:0', dtype=torch.bfloat16)

inputs = processor(text='Find me an everyday image that matches the given caption: A cat and a tiger.', return_tensors="pt").to("cuda")
qry_output = last_pooling(model(**inputs, return_dict=True, output_hidden_states=True).hidden_states[-1], inputs['attention_mask'])
+string = '<|image|><|begin_of_text|>Represent the given image.'
tgt_inputs = processor(text=string, images=[image], return_tensors="pt").to("cuda")
tgt_output = last_pooling(model(**tgt_inputs, return_dict=True, output_hidden_states=True).hidden_states[-1], tgt_inputs['attention_mask'])
print(string, '=', compute_similarity(qry_output, tgt_output))
+## <|image|><|begin_of_text|>Represent the given image. = tensor([[0.3770]], device='cuda:0', dtype=torch.bfloat16)
```
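For context, the code in these hunks leans on setup and helper definitions from earlier in README.md, outside this diff: a loaded `model` and `processor`, plus `last_pooling` and `compute_similarity`. Below is a minimal sketch of that scaffolding; the checkpoint name, the `MllamaForConditionalGeneration` class, and the helper bodies are assumptions inferred from the calls above (bfloat16 on CUDA matches the printed outputs), not copied from the README.

```python
# Minimal sketch of the scaffolding the snippet above assumes; the actual
# definitions live earlier in README.md and may differ in detail.
import torch
import requests
from PIL import Image  # requests/PIL are what the snippet uses to fetch the example image
from transformers import MllamaForConditionalGeneration, AutoProcessor

model_name = 'intfloat/mmE5-mllama-11b-instruct'  # assumed checkpoint name
processor = AutoProcessor.from_pretrained(model_name)
model = MllamaForConditionalGeneration.from_pretrained(
    model_name, torch_dtype=torch.bfloat16  # bfloat16/CUDA match the printed outputs
).to('cuda')
model.eval()

def last_pooling(last_hidden_state, attention_mask, normalize=True):
    # Pick the hidden state of each sequence's last non-padded token.
    sequence_lengths = attention_mask.sum(dim=1) - 1
    batch_size = last_hidden_state.shape[0]
    reps = last_hidden_state[torch.arange(batch_size, device=last_hidden_state.device), sequence_lengths]
    if normalize:
        reps = torch.nn.functional.normalize(reps, p=2, dim=-1)
    return reps

def compute_similarity(q_reps, p_reps):
    # With normalized embeddings this dot product is a cosine similarity.
    return torch.matmul(q_reps, p_reps.transpose(0, 1))
```

Because `last_pooling` L2-normalizes the last-token representation, the matrix product in `compute_similarity` behaves as a cosine similarity, which is why in the second hunk the matching caption scores higher against the image (0.4141) than the mismatched one (0.3770).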
### Sentence Transformers