make flash attention optional
custom_st.py CHANGED (+19 -9)

@@ -32,15 +32,25 @@ class Transformer(nn.Module):
         self.max_pixels = max_pixels
         self.min_pixels = min_pixels

-        #
-        self.model = Qwen2VLForConditionalGeneration.from_pretrained(
-            model_name_or_path,
-            attn_implementation="flash_attention_2",
-            torch_dtype=torch.bfloat16,
-            device_map=device,
-            cache_dir=cache_dir,
-            **kwargs
-        ).eval()
+        # Try to use flash attention if available, fallback to default attention if not
+        try:
+            self.model = Qwen2VLForConditionalGeneration.from_pretrained(
+                model_name_or_path,
+                attn_implementation="flash_attention_2",
+                torch_dtype=torch.bfloat16,
+                device_map=device,
+                cache_dir=cache_dir,
+                **kwargs
+            ).eval()
+        except (ImportError, ValueError) as e:
+            print(f"Flash attention not available, falling back to default attention: {e}")
+            self.model = Qwen2VLForConditionalGeneration.from_pretrained(
+                model_name_or_path,
+                torch_dtype=torch.bfloat16,
+                device_map=device,
+                cache_dir=cache_dir,
+                **kwargs
+            ).eval()

         # Initialize processor
         self.processor = AutoProcessor.from_pretrained(
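For context, a minimal sketch of an equivalent approach that probes for the flash_attn package up front instead of catching the exception, so the from_pretrained call is not written twice. The load_model helper, its defaults, and the placeholder arguments are illustrative only and not part of custom_st.py:

import importlib.util

import torch
from transformers import Qwen2VLForConditionalGeneration


def load_model(model_name_or_path, device="cuda", cache_dir=None, **kwargs):
    # Use flash attention only when the flash_attn package is importable and a
    # CUDA device is present; otherwise fall back to the transformers default.
    use_flash = importlib.util.find_spec("flash_attn") is not None and torch.cuda.is_available()
    attn_kwargs = {"attn_implementation": "flash_attention_2"} if use_flash else {}
    return Qwen2VLForConditionalGeneration.from_pretrained(
        model_name_or_path,
        torch_dtype=torch.bfloat16,
        device_map=device,
        cache_dir=cache_dir,
        **attn_kwargs,
        **kwargs,
    ).eval()

Note that the try/except in the commit also covers the case where flash-attn is installed but unusable on the current setup (surfaced as a ValueError), which an import check alone would miss.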

