Update modeling_GOT.py
modeling_GOT.py  (+16 -15)
@@ -249,7 +249,7 @@ class GOTQwenModel(Qwen2Model):
                 image_patches_features = []
                 for image_patch in image_patches:
                     image_p = torch.stack([image_patch])
-
+
                     with torch.set_grad_enabled(False):
                         cnn_feature_p = vision_tower_high(image_p)
                         cnn_feature_p = cnn_feature_p.flatten(2).permute(0, 2, 1)
@@ -257,7 +257,6 @@ class GOTQwenModel(Qwen2Model):
                     image_patches_features.append(image_feature_p)
                 image_feature = torch.cat(image_patches_features, dim=1)
                 image_features.append(image_feature)
-                exit()


             dummy_image_features_2 = torch.zeros(256, 1024, device=inputs_embeds.device, dtype=inputs_embeds.dtype)
@@ -485,7 +484,7 @@ class GOTQwenForCausalLM(Qwen2ForCausalLM):
         setattr(torch.nn.Linear, "reset_parameters", lambda self: None)
         setattr(torch.nn.LayerNorm, "reset_parameters", lambda self: None)

-    def chat(self, tokenizer, image_file, ocr_type, ocr_box='', ocr_color='', render=False, save_render_file=None):
+    def chat(self, tokenizer, image_file, ocr_type, ocr_box='', ocr_color='', render=False, save_render_file=None, print_prompt=False):

         self.disable_torch_init()

@@ -549,7 +548,8 @@ class GOTQwenForCausalLM(Qwen2ForCausalLM):
         conv.append_message(conv.roles[1], None)
         prompt = conv.get_prompt()

-
+        if print_prompt:
+            print(prompt)

         inputs = tokenizer([prompt])

@@ -570,7 +570,7 @@ class GOTQwenForCausalLM(Qwen2ForCausalLM):
                 do_sample=False,
                 num_beams = 1,
                 no_repeat_ngram_size = 20,
-                streamer=streamer,
+                # streamer=streamer,
                 max_new_tokens=4096,
                 stopping_criteria=[stopping_criteria]
                 )
@@ -715,7 +715,7 @@ class GOTQwenForCausalLM(Qwen2ForCausalLM):
         return processed_images


-    def chat_plus(self, tokenizer,
+    def chat_plus(self, tokenizer, image_file, render=False, save_render_file=None, print_prompt=False):
         # Model
         self.disable_torch_init()
         multi_page=False
@@ -730,8 +730,8 @@ class GOTQwenForCausalLM(Qwen2ForCausalLM):

         image_list = []

-        if len(image_file_list)>1:
-            multi_page = True
+        # if len(image_file_list)>1:
+        # multi_page = True

         if multi_page:
             qs = 'OCR with format across multi pages: '
@@ -739,19 +739,19 @@ class GOTQwenForCausalLM(Qwen2ForCausalLM):
             import glob
             # from natsort import natsorted
             # patches = glob.glob(image_file + '/*png')
-            patches =
+            patches = image_file
             # patches = natsorted(patches)
             sub_images = []
             for sub_image in patches:
                 sub_images.append(self.load_image(sub_image))

             ll = len(patches)
-            print(patches)
-            print("len ll: ", ll)
+            # print(patches)
+            # print("len ll: ", ll)

         else:
             qs = 'OCR with format upon the patch reference: '
-            img = self.load_image(
+            img = self.load_image(image_file)
             sub_images = self.dynamic_preprocess(img)
             ll = len(sub_images)

@@ -762,7 +762,7 @@ class GOTQwenForCausalLM(Qwen2ForCausalLM):

         image_list = torch.stack(image_list)

-        print('====new images batch size======: ',image_list.shape)
+        print('====new images batch size======: \n',image_list.shape)


         if use_im_start_end:
@@ -788,7 +788,8 @@ class GOTQwenForCausalLM(Qwen2ForCausalLM):
         conv.append_message(conv.roles[1], None)
         prompt = conv.get_prompt()

-
+        if print_prompt:
+            print(prompt)

         inputs = tokenizer([prompt])

@@ -807,7 +808,7 @@ class GOTQwenForCausalLM(Qwen2ForCausalLM):
                 do_sample=False,
                 num_beams = 1,
                 # no_repeat_ngram_size = 20,
-                streamer=streamer,
+                # streamer=streamer,
                 max_new_tokens=4096,
                 stopping_criteria=[stopping_criteria]
                 )
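For reference, a minimal usage sketch of the two updated entry points. Only the chat()/chat_plus() signatures and the new print_prompt flag come from this commit; the checkpoint id, loading flags, ocr_type value, and image paths below are assumed placeholders, not part of the change.

from transformers import AutoModel, AutoTokenizer

# Assumed checkpoint id and loading kwargs (placeholders, not from this commit).
tokenizer = AutoTokenizer.from_pretrained('ucaslcl/GOT-OCR2_0', trust_remote_code=True)
model = AutoModel.from_pretrained('ucaslcl/GOT-OCR2_0', trust_remote_code=True,
                                  use_safetensors=True, pad_token_id=tokenizer.eos_token_id)
model = model.eval().cuda()

# chat() gains an optional print_prompt flag that dumps the assembled
# conversation prompt before generation.
res = model.chat(tokenizer, 'sample.png', ocr_type='format', print_prompt=True)

# chat_plus() now takes a single image_file plus the same print_prompt flag;
# the image is split internally via dynamic_preprocess, and the multi-page
# branch is commented out in this commit.
res_plus = model.chat_plus(tokenizer, 'sample.png', print_prompt=True)

print(res)
print(res_plus)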