Spaces:

TIGER-Lab
/

Pixel-Reasoner

Running on Zero

App Files Community

Haozhe committed on May 23

Commit

cbc410e

1 Parent(s): 7ba5930

update

Browse files

Files changed (1) hide show

app.py +29 -16

app.py CHANGED Viewed

@@ -7,12 +7,12 @@ import pickle as pkl
 import re
 from PIL import Image
 import json
-# import spaces
 from serve_constants import html_header, bibtext, learn_more_markdown, tos_markdown
-MODEL_ID = "TIGER-Lab/PixelReasoner-RL-v1"
-example_image = "example_images/1.jpg"
 # "example_images/document.png"
 example_text = "What kind of restaurant is it?"
 processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True,
@@ -117,7 +117,7 @@ def parse_last_tool(output_text):
 tool_end = '</tool_call>'
 tool_start = '<tool_call>'
-# @spaces.GPU
 def model_inference(input_dict, history):
     text = input_dict["text"]
     files = input_dict["files"]
@@ -171,7 +171,8 @@ def model_inference(input_dict, history):
     })
     print(messages)
-    complete_assistant_response_for_gradio = ""
     while True:
         """
         Generate and stream text
@@ -185,7 +186,7 @@ def model_inference(input_dict, history):
         ).to("cuda")
         streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=False)
-        generation_kwargs = dict(inputs, streamer=streamer, max_new_tokens=1024, temperature=0.01, top_p=1.0, top_k=1)
         # import pdb; pdb.set_trace()
         thread = Thread(target=model.generate, kwargs=generation_kwargs)
         thread.start()
@@ -196,20 +197,26 @@ def model_inference(input_dict, history):
         #     yield buffer
         # print(buffer)
         current_model_output_segment = "" # Text generated in this specific model call
         for new_text_chunk in streamer:
             current_model_output_segment += new_text_chunk
             # Yield the sum of previously committed full response parts + current streaming segment
-            yield complete_assistant_response_for_gradio + current_model_output_segment
-        tmp = f"\n<b>Planning Visual Operations ...</b>\n\n"
-        yield complete_assistant_response_for_gradio + current_model_output_segment.split(tool_start)[0] + tmp
         thread.join()
         # Process the full segment (e.g., remove <|im_end|>)
         processed_segment = current_model_output_segment.split("<|im_end|>", 1)[0] if "<|im_end|>" in current_model_output_segment else current_model_output_segment
         # Append this processed segment to the cumulative display string for Gradio
-        complete_assistant_response_for_gradio += processed_segment + "\n\n"
-        print(f"this one: {complete_assistant_response_for_gradio}")
         yield complete_assistant_response_for_gradio # Ensure the fully processed segment is yielded to Gradio
@@ -217,28 +224,34 @@ def model_inference(input_dict, history):
         qatext_for_tool_check = processed_segment
         require_tool = tool_end in qatext_for_tool_check and tool_start in qatext_for_tool_check
         if require_tool:
             tool_params = parse_last_tool(qatext_for_tool_check)
             tool_name = tool_params['name']
             tool_args = tool_params['arguments']
-            complete_assistant_response_for_gradio += f"\n<b>Executing Visual Operations ...</b> @{tool_name}({tool_args})\n\n"
             yield complete_assistant_response_for_gradio # Update Gradio display
             video_flag = False
             raw_result = execute_tool(imagelist, rawimagelist, tool_args, tool_name, is_video=video_flag)
             print(raw_result)
             proc_img = raw_result
             all_images += [proc_img]
             new_piece = dict(role='user', content=[
                                     dict(type='text', text="\nHere is the cropped image (Image Size: {}x{}):".format(proc_img.size[0], proc_img.size[1])),
                                     dict(type='image', image=proc_img)
                                 ]
             )
             messages.append(new_piece)
-            complete_assistant_response_for_gradio += f"\n<b>Analyzing Operation Result ...</b> @region(size={proc_img.size[0]}x{proc_img.size[1]})\n\n"
             yield complete_assistant_response_for_gradio # Update Gradio display
@@ -267,4 +280,4 @@ with gr.Blocks() as demo:
     gr.Markdown(learn_more_markdown)
     gr.Markdown(bibtext)
-demo.launch(debug=True)

 import re
 from PIL import Image
 import json
+import spaces
 from serve_constants import html_header, bibtext, learn_more_markdown, tos_markdown
+MODEL_ID = "/home/ma-user/work/haozhe/workspace/lmm-r1/toolckpts/pix17K0506wt-NormalizedPenalizedFixedReweightCont-256-lossvernone-samplevernone-fmtnone-group-n8-ml10000-lr10-sysvcot-8node/global_step24_hf_evalbest"
+example_image = "/home/ma-user/work/haozhe/workspace/vlspaces/example_images/1.jpg"
 # "example_images/document.png"
 example_text = "What kind of restaurant is it?"
 processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True,
 tool_end = '</tool_call>'
 tool_start = '<tool_call>'
+@spaces.GPU
 def model_inference(input_dict, history):
     text = input_dict["text"]
     files = input_dict["files"]
     })
     print(messages)
+    # complete_assistant_response_for_gradio = ""
+    complete_assistant_response_for_gradio = []
     while True:
         """
         Generate and stream text
         ).to("cuda")
         streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=False)
+        generation_kwargs = dict(inputs, streamer=streamer, max_new_tokens=1024, temperature=0.1, top_p=0.95, top_k=50)
         # import pdb; pdb.set_trace()
         thread = Thread(target=model.generate, kwargs=generation_kwargs)
         thread.start()
         #     yield buffer
         # print(buffer)
         current_model_output_segment = "" # Text generated in this specific model call
+        toolflag = False
         for new_text_chunk in streamer:
             current_model_output_segment += new_text_chunk
             # Yield the sum of previously committed full response parts + current streaming segment
+            # yield complete_assistant_response_for_gradio + current_model_output_segment
+            if tool_start in current_model_output_segment:
+                toolflag = True
+                tmp = current_model_output_segment.split(tool_start)[0]
+                yield complete_assistant_response_for_gradio + [tmp+"\n\n<b>Planning Visual Operations ...</b>\n\n"]
+            if not toolflag:
+                yield complete_assistant_response_for_gradio + [current_model_output_segment]
         thread.join()
         # Process the full segment (e.g., remove <|im_end|>)
         processed_segment = current_model_output_segment.split("<|im_end|>", 1)[0] if "<|im_end|>" in current_model_output_segment else current_model_output_segment
         # Append this processed segment to the cumulative display string for Gradio
+        # complete_assistant_response_for_gradio += processed_segment + "\n\n"
+        complete_assistant_response_for_gradio += [processed_segment + "\n\n"]
+        # print(f"this one: {complete_assistant_response_for_gradio}")
         yield complete_assistant_response_for_gradio # Ensure the fully processed segment is yielded to Gradio
         qatext_for_tool_check = processed_segment
         require_tool = tool_end in qatext_for_tool_check and tool_start in qatext_for_tool_check
+        # print(f"Segment from model: \"{qatext_for_tool_check[:200]}...\", Requires tool: {require_tool}")
         if require_tool:
             tool_params = parse_last_tool(qatext_for_tool_check)
             tool_name = tool_params['name']
             tool_args = tool_params['arguments']
+            # complete_assistant_response_for_gradio += f"\n<b>Executing Visual Operations ...</b> @{tool_name}({tool_args})\n\n"
+            complete_assistant_response_for_gradio += [f"\n<b>Executing Visual Operations ...</b> @{tool_name}({tool_args})\n\n"]
             yield complete_assistant_response_for_gradio # Update Gradio display
             video_flag = False
             raw_result = execute_tool(imagelist, rawimagelist, tool_args, tool_name, is_video=video_flag)
             print(raw_result)
             proc_img = raw_result
             all_images += [proc_img]
+            # complete_assistant_response_for_gradio += [(proc_img, "Visual Operation Result")]
+            # yield complete_assistant_response_for_gradio # Update Gradio display
             new_piece = dict(role='user', content=[
                                     dict(type='text', text="\nHere is the cropped image (Image Size: {}x{}):".format(proc_img.size[0], proc_img.size[1])),
                                     dict(type='image', image=proc_img)
                                 ]
             )
             messages.append(new_piece)
+            # print(messages)
+            # complete_assistant_response_for_gradio += f"\n<b>Analyzing Operation Result ...</b> @region(size={proc_img.size[0]}x{proc_img.size[1]})\n\n"
+            complete_assistant_response_for_gradio += [f"\n<b>Analyzing Operation Result ...</b> @region(size={proc_img.size[0]}x{proc_img.size[1]})\n\n"]
             yield complete_assistant_response_for_gradio # Update Gradio display
     gr.Markdown(learn_more_markdown)
     gr.Markdown(bibtext)
+demo.launch(debug=True, share=True)