Update modeling_GOT.py
Adds a stream_flag argument to chat() and chat_crop(): when it is set, the TextStreamer is passed to generate() so decoded text streams as it is produced; when left at the default False, generation runs without a streamer, as before.

modeling_GOT.py  (+50 -26)
@@ -484,7 +484,7 @@ class GOTQwenForCausalLM(Qwen2ForCausalLM):
         setattr(torch.nn.Linear, "reset_parameters", lambda self: None)
         setattr(torch.nn.LayerNorm, "reset_parameters", lambda self: None)
 
-    def chat(self, tokenizer, image_file, ocr_type, ocr_box='', ocr_color='', render=False, save_render_file=None, print_prompt=False, gradio_input=False):
+    def chat(self, tokenizer, image_file, ocr_type, ocr_box='', ocr_color='', render=False, save_render_file=None, print_prompt=False, gradio_input=False, stream_flag = False):
 
         self.disable_torch_init()
 
@@ -565,18 +565,30 @@ class GOTQwenForCausalLM(Qwen2ForCausalLM):
         stopping_criteria = KeywordsStoppingCriteria(keywords, tokenizer, input_ids)
         streamer = TextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
 
-        with torch.autocast("cuda", dtype=torch.bfloat16):
-            output_ids = self.generate(
-                input_ids,
-                images=[image_tensor_1.unsqueeze(0).half().cuda()],
-                do_sample=False,
-                num_beams = 1,
-                no_repeat_ngram_size = 20,
-                streamer=streamer,
-                max_new_tokens=4096,
-                stopping_criteria=[stopping_criteria]
-                )
-
+        if stream_flag:
+            with torch.autocast("cuda", dtype=torch.bfloat16):
+                output_ids = self.generate(
+                    input_ids,
+                    images=[image_tensor_1.unsqueeze(0).half().cuda()],
+                    do_sample=False,
+                    num_beams = 1,
+                    no_repeat_ngram_size = 20,
+                    streamer=streamer,
+                    max_new_tokens=4096,
+                    stopping_criteria=[stopping_criteria]
+                    )
+        else:
+            with torch.autocast("cuda", dtype=torch.bfloat16):
+                output_ids = self.generate(
+                    input_ids,
+                    images=[image_tensor_1.unsqueeze(0).half().cuda()],
+                    do_sample=False,
+                    num_beams = 1,
+                    no_repeat_ngram_size = 20,
+                    # streamer=streamer,
+                    max_new_tokens=4096,
+                    stopping_criteria=[stopping_criteria]
+                    )
 
         outputs = tokenizer.decode(output_ids[0, input_ids.shape[1]:]).strip()
 
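For reference, a minimal usage sketch of the new flag. The checkpoint id and image path below are illustrative placeholders, and the loading boilerplate assumes the usual trust_remote_code pattern for this model family:

    from transformers import AutoModel, AutoTokenizer

    # Illustrative checkpoint id; substitute the repo this file belongs to.
    tokenizer = AutoTokenizer.from_pretrained('ucaslcl/GOT-OCR2_0', trust_remote_code=True)
    model = AutoModel.from_pretrained('ucaslcl/GOT-OCR2_0', trust_remote_code=True,
                                      low_cpu_mem_usage=True, device_map='cuda',
                                      use_safetensors=True)
    model = model.eval().cuda()

    # stream_flag=True hands the TextStreamer to generate(), so decoded text
    # is printed token by token as it is produced.
    res = model.chat(tokenizer, 'image.png', ocr_type='ocr', stream_flag=True)

    # stream_flag defaults to False, keeping the previous non-streaming behavior.
    res = model.chat(tokenizer, 'image.png', ocr_type='ocr')
    print(res)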
@@ -716,7 +728,7 @@ class GOTQwenForCausalLM(Qwen2ForCausalLM):
         return processed_images
 
 
-    def chat_crop(self, tokenizer, image_file, ocr_type, render=False, save_render_file=None, print_prompt=False, gradio_input=False):
+    def chat_crop(self, tokenizer, image_file, ocr_type, render=False, save_render_file=None, print_prompt=False, gradio_input=False, stream_flag = False):
         # Model
         self.disable_torch_init()
         multi_page=False
@@ -807,18 +819,30 @@ class GOTQwenForCausalLM(Qwen2ForCausalLM):
         stopping_criteria = KeywordsStoppingCriteria(keywords, tokenizer, input_ids)
         streamer = TextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
 
-        with torch.autocast("cuda", dtype=torch.bfloat16):
-            output_ids = self.generate(
-                input_ids,
-                images=[image_list.half().cuda()],
-                do_sample=False,
-                num_beams = 1,
-                # no_repeat_ngram_size = 20,
-                streamer=streamer,
-                max_new_tokens=4096,
-                stopping_criteria=[stopping_criteria]
-                )
-
+        if stream_flag:
+            with torch.autocast("cuda", dtype=torch.bfloat16):
+                output_ids = self.generate(
+                    input_ids,
+                    images=[image_list.half().cuda()],
+                    do_sample=False,
+                    num_beams = 1,
+                    # no_repeat_ngram_size = 20,
+                    streamer=streamer,
+                    max_new_tokens=4096,
+                    stopping_criteria=[stopping_criteria]
+                    )
+        else:
+            with torch.autocast("cuda", dtype=torch.bfloat16):
+                output_ids = self.generate(
+                    input_ids,
+                    images=[image_list.half().cuda()],
+                    do_sample=False,
+                    num_beams = 1,
+                    # no_repeat_ngram_size = 20,
+                    # streamer=streamer,
+                    max_new_tokens=4096,
+                    stopping_criteria=[stopping_criteria]
+                    )
 
         outputs = tokenizer.decode(output_ids[0, input_ids.shape[1]:]).strip()
 
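chat_crop() gains the same switch; it differs from chat() in passing the batched crops (image_list) to generate() and in leaving no_repeat_ngram_size commented out in both branches. Note that the streamer is still constructed unconditionally in both methods; the flag only controls whether it is handed to generate(). A usage sketch under the same assumptions as above:

    # 'format' requests formatted (e.g. markdown/LaTeX-style) OCR output,
    # while 'ocr' requests plain text; chat() accepts both values as well.
    res = model.chat_crop(tokenizer, 'page.png', ocr_type='format', stream_flag=True)
    print(res)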
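Both hunks rely on KeywordsStoppingCriteria, which is defined elsewhere in modeling_GOT.py and is not part of this diff. As a rough illustration of the pattern (not the repo's actual implementation), a keywords-based criterion typically decodes the newly generated tail and stops once a keyword appears:

    import torch
    from transformers import StoppingCriteria

    class KeywordsStoppingCriteriaSketch(StoppingCriteria):
        def __init__(self, keywords, tokenizer, input_ids):
            self.keywords = keywords
            self.tokenizer = tokenizer
            self.prompt_len = input_ids.shape[1]  # prompt length, skipped when checking

        def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool:
            # During generation, input_ids holds the prompt plus all tokens decoded
            # so far; decode only the new part and stop once any keyword shows up.
            text = self.tokenizer.decode(input_ids[0, self.prompt_len:])
            return any(keyword in text for keyword in self.keywords)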