remiai3 committed
Commit ce307e6 · verified · 1 Parent(s): 65b023f

Upload 4 files

Files changed (4)
  1. README.md +28 -0
  2. main.py +27 -0
  3. remiai.png +0 -0
  4. requirements.txt +10 -0
README.md ADDED
@@ -0,0 +1,28 @@
+ # Image Captioning (CPU/GPU)
+
+ - **Model:** `nlpconnect/vit-gpt2-image-captioning` (MIT)
+ - **Task:** Generate a caption for a given image.
+ - **Note:** We only provide the resources to run this model on a laptop; we did not develop the model ourselves. It is an open-source model by nlpconnect, used here for experimentation.
+
+ ## Quick start (any project)
+
+ ```bash
+ # 1) Create env
+ python -m venv venv && source venv/bin/activate  # Windows: venv\Scripts\activate
+
+ # 2) Install deps
+ pip install -r requirements.txt
+
+ # 3) Run
+ python main.py --help
+ ```
+
+ > Tip: If you have a GPU + CUDA, PyTorch will auto-use it. If not, everything runs on CPU (slower but works).
+
+ ---
+ The script only prints a caption when an image is passed on the command line.
+ **Use:** `python main.py --image remiai.png` or `python main.py --image sample.jpg`
+
+ Otherwise argparse exits with an error:
+ usage: main.py [-h] --image IMAGE [--max_length MAX_LENGTH]
+ error: the following arguments are required: --image
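For a quick sanity check outside of main.py, the same model can also be driven through the Transformers image-to-text pipeline. This is a minimal sketch, assuming the dependencies in requirements.txt are installed; the image path is only an example:

```python
# Minimal sketch: caption an image with the same model via the high-level pipeline API.
from transformers import pipeline

captioner = pipeline("image-to-text", model="nlpconnect/vit-gpt2-image-captioning")
result = captioner("remiai.png")       # accepts a local path, URL, or PIL.Image
print(result[0]["generated_text"])     # the generated caption string
```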
main.py ADDED
@@ -0,0 +1,27 @@
+ import argparse, torch
+ from transformers import VisionEncoderDecoderModel, ViTImageProcessor, AutoTokenizer
+ from PIL import Image
+
+ def main():
+     parser = argparse.ArgumentParser()
+     parser.add_argument("--image", type=str, required=True)
+     parser.add_argument("--max_length", type=int, default=20)
+     args = parser.parse_args()
+
+     model_id = "nlpconnect/vit-gpt2-image-captioning"
+     device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+
+     model = VisionEncoderDecoderModel.from_pretrained(model_id).to(device)
+     feature_extractor = ViTImageProcessor.from_pretrained(model_id)
+     tokenizer = AutoTokenizer.from_pretrained(model_id)
+
+     img = Image.open(args.image).convert("RGB")
+     pixel_values = feature_extractor(images=[img], return_tensors="pt").pixel_values.to(device)
+
+     with torch.no_grad():
+         output_ids = model.generate(pixel_values, max_length=args.max_length)[0]
+     caption = tokenizer.decode(output_ids, skip_special_tokens=True)
+     print(caption)
+
+ if __name__ == "__main__":
+     main()
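For reference, a typical invocation of the script above (the `--max_length` flag is optional and defaults to 20 tokens):

```bash
# Caption the bundled image; a larger max_length allows a longer caption.
python main.py --image remiai.png --max_length 30
```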
remiai.png ADDED
requirements.txt ADDED
@@ -0,0 +1,10 @@
+ torch>=2.0.0
+ torchvision>=0.15.0
+ transformers>=4.40.0
+ sentence-transformers>=2.2.2
+ easyocr>=1.7.1
+ openai-whisper>=20231106
+ TTS>=0.22.0
+ pillow>=9.5.0
+ accelerate>=0.26.0
+ numpy>=1.24.0