mgbam committed on
Commit f9bd215 · verified · 1 Parent(s): bbea25a

Update app.py

Files changed (1): app.py +155 -144

app.py CHANGED
@@ -1,59 +1,30 @@
- import gradio as gr
- import base64
  import os
  import re
- from io import BytesIO
  from PIL import Image
  from huggingface_hub import InferenceClient
  from mistralai import Mistral
- from feifeilib.feifeichat import feifeichat  # Assuming this utility is still relevant or replace with SmartDocAnalyzer logic as needed.

- # Initialize Hugging Face inference clients
  client = InferenceClient(api_key=os.getenv('HF_TOKEN'))
  client.headers["x-use-cache"] = "0"
-
  api_key = os.getenv("MISTRAL_API_KEY")
  Mistralclient = Mistral(api_key=api_key)

- # Gradio interface setup for SmartDocAnalyzer
- SmartDocAnalyzer = gr.ChatInterface(
-     feifeichat,  # This should be replaced with a suitable function for SmartDocAnalyzer if needed.
-     type="messages",
-     multimodal=True,
-     additional_inputs=[
-         gr.Checkbox(label="Enable Analyzer Mode", value=True),
-         gr.Dropdown(
-             [
-                 "meta-llama/Llama-3.3-70B-Instruct",
-                 "CohereForAI/c4ai-command-r-plus-08-2024",
-                 "Qwen/Qwen2.5-72B-Instruct",
-                 "nvidia/Llama-3.1-Nemotron-70B-Instruct-HF",
-                 "NousResearch/Hermes-3-Llama-3.1-8B",
-                 "mistralai/Mistral-Nemo-Instruct-2411",
-                 "microsoft/phi-4"
-             ],
-             value="mistralai/Mistral-Nemo-Instruct-2411",
-             show_label=False,
-             container=False
-         ),
-         gr.Radio(
-             ["pixtral", "Vision"],
-             value="pixtral",
-             show_label=False,
-             container=False
-         )
-     ],
-     title="SmartDocAnalyzer",
-     description="An advanced document analysis tool powered by AI."
- )
-
- SmartDocAnalyzer.launch()

  def encode_image(image_path):
-     """
-     Encode the image at the given path to a base64 JPEG.
-     Resizes image height to 512 pixels while maintaining aspect ratio.
-     """
      try:
          image = Image.open(image_path).convert("RGB")
          base_height = 512
@@ -63,68 +34,109 @@ def encode_image(image_path):
          buffered = BytesIO()
          image.save(buffered, format="JPEG")
          return base64.b64encode(buffered.getvalue()).decode("utf-8")
-     except FileNotFoundError:
-         print(f"Error: The file {image_path} was not found.")
      except Exception as e:
-         print(f"Error: {e}")
-         return None

- def feifeiprompt(feifei_select=True, message_text="", history=""):
      """
-     Constructs a prompt for the chatbot based on message text and history.
-     Enhancements for SmartDocAnalyzer context can be added here.
      """
-     input_prompt = []
-     # Special handling for drawing requests
-     if message_text.startswith("画") or message_text.startswith("draw"):
-         feifei_photo = (
-             "You are FeiFei. Background: FeiFei was born in Tokyo and is a natural-born photographer, "
-             "hailing from a family with a long history in photography... [truncated for brevity]"
-         )
-         message_text = message_text.replace("画", "").replace("draw", "")
-         # (Chinese prompt, roughly: "The prompt is '<message_text>'; based on it,
-         # write a one-sentence English reply for generating a high-quality photo.")
-         message_text = f"提示词是'{message_text}',根据提示词帮我生成一张高质量照片的一句话英文回复"
-         system_prompt = {"role": "system", "content": feifei_photo}
-         user_input_part = {"role": "user", "content": str(message_text)}
-         return [system_prompt, user_input_part]
-
-     # Default prompt construction for FeiFei character
-     if feifei_select:
-         feifei = (
-             "[Character Name]: Aifeifei (AI Feifei) [Gender]: Female [Age]: 19 years old ... "
-             "[Identity]: User's virtual girlfriend"
-         )
-         system_prompt = {"role": "system", "content": feifei}
-         user_input_part = {"role": "user", "content": str(message_text)}
-
-         pattern = re.compile(r"gradio")
-         if history:
-             history = [item for item in history if not pattern.search(str(item["content"]))]
-             input_prompt = [system_prompt] + history + [user_input_part]
-         else:
-             input_prompt = [system_prompt, user_input_part]
-     else:
-         input_prompt = [{"role": "user", "content": str(message_text)}]

-     return input_prompt

- def feifeiimgprompt(message_files, message_text, image_mod):
      """
-     Handles image-based prompts for either 'Vision' or 'pixtral' modes.
      """
-     message_file = message_files[0]
-     base64_image = encode_image(message_file)
-     if base64_image is None:
          return

-     # Vision mode using meta-llama model
      if image_mod == "Vision":
-         messages = [{
-             "role": "user",
-             "content": [
-                 {"type": "text", "text": message_text},
-                 {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"}}
-             ]
-         }]
          stream = client.chat.completions.create(
              model="meta-llama/Llama-3.2-11B-Vision-Instruct",
              messages=messages,
@@ -133,69 +145,68 @@ def feifeiimgprompt(message_files, message_text, image_mod):
          )
          temp = ""
          for chunk in stream:
-             if chunk.choices[0].delta.content is not None:
                  temp += chunk.choices[0].delta.content
                  yield temp
-     # Pixtral mode using Mistral model
      else:
          model = "pixtral-large-2411"
-         messages = [{
-             "role": "user",
-             "content": [
-                 {"type": "text", "text": message_text},
-                 {"type": "image_url", "image_url": f"data:image/jpeg;base64,{base64_image}"}
-             ]
-         }]
          partial_message = ""
          for chunk in Mistralclient.chat.stream(model=model, messages=messages):
-             if chunk.data.choices[0].delta.content is not None:
                  partial_message += chunk.data.choices[0].delta.content
                  yield partial_message

- def feifeichatmod(additional_dropdown, input_prompt):
      """
-     Chooses the appropriate chat model based on the dropdown selection.
-     """
-     if additional_dropdown == "mistralai/Mistral-Nemo-Instruct-2411":
-         model = "mistral-large-2411"
-         stream_response = Mistralclient.chat.stream(model=model, messages=input_prompt)
-         partial_message = ""
-         for chunk in stream_response:
-             if chunk.data.choices[0].delta.content is not None:
-                 partial_message += chunk.data.choices[0].delta.content
-                 yield partial_message
-     else:
-         stream = client.chat.completions.create(
-             model=additional_dropdown,
-             messages=input_prompt,
-             temperature=0.5,
-             max_tokens=1024,
-             top_p=0.7,
-             stream=True
-         )
-         temp = ""
-         for chunk in stream:
-             if chunk.choices[0].delta.content is not None:
-                 temp += chunk.choices[0].delta.content
-                 yield temp
-
- def feifeichat(message, history, feifei_select, additional_dropdown, image_mod):
-     """
-     Main chat function that decides between image-based and text-based handling.
-     This function can be further enhanced for SmartDocAnalyzer-specific logic.
      """
      message_text = message.get("text", "")
      message_files = message.get("files", [])

      if message_files:
-         # Process image input
-         yield from feifeiimgprompt(message_files, message_text, image_mod)
      else:
-         # Process text input
-         input_prompt = feifeiprompt(feifei_select, message_text, history)
-         yield from feifeichatmod(additional_dropdown, input_prompt)
-
- # Enhancement Note:
- # For the SmartDocAnalyzer space, consider integrating document parsing,
- # OCR functionalities, semantic analysis of documents, and more advanced
- # error handling as needed. This template serves as a starting point.

app.py (new version):

  import os
  import re
+ import base64
+ import gradio as gr
+ import pdfplumber   # For PDF document parsing
+ import fitz         # PyMuPDF, for advanced PDF handling (alternative to pdfplumber)
+ import pytesseract  # OCR for extracting text from images
  from PIL import Image
+ from io import BytesIO
+ from transformers import pipeline  # For semantic analysis tasks
  from huggingface_hub import InferenceClient
  from mistralai import Mistral

+ # Initialize inference clients for different models
  client = InferenceClient(api_key=os.getenv('HF_TOKEN'))
  client.headers["x-use-cache"] = "0"
  api_key = os.getenv("MISTRAL_API_KEY")
  Mistralclient = Mistral(api_key=api_key)

+ # Initialize semantic analysis pipelines using transformers (for local tasks)
+ # Example tasks: summarization, sentiment analysis, named-entity recognition.
+ summarizer = pipeline("summarization")
+ sentiment_analyzer = pipeline("sentiment-analysis")
+ ner_tagger = pipeline("ner")
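+ # Note: called without a model name, pipeline() downloads a default checkpoint
+ # per task at startup (for summarization that is sshleifer/distilbart-cnn-12-6,
+ # the long-standing library default). Pinning a model explicitly, e.g.
+ # pipeline("summarization", model="sshleifer/distilbart-cnn-12-6"), keeps
+ # startup reproducible; the commit itself leaves the defaults in place.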
 
  def encode_image(image_path):
+     """Resizes and encodes an image to base64."""
      try:
          image = Image.open(image_path).convert("RGB")
          base_height = 512
          # ... (aspect-ratio resize lines unchanged; hidden by the diff view)
          buffered = BytesIO()
          image.save(buffered, format="JPEG")
          return base64.b64encode(buffered.getvalue()).decode("utf-8")
      except Exception as e:
+         print(f"Image encoding error: {e}")
+         return None
+
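+ # Usage sketch: encode_image("photo.jpg") returns a base64 string (JPEG bytes
+ # encode to a string starting with "/9j/") or None on failure.
+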
+ def extract_text_from_document(file_path):
+     """Extracts text from a PDF or image document."""
+     text = ""
+     # Try PDF parsing with pdfplumber first
+     if file_path.lower().endswith(".pdf"):
+         try:
+             with pdfplumber.open(file_path) as pdf:
+                 for page in pdf.pages:
+                     # extract_text() returns None for pages with no text layer
+                     text += (page.extract_text() or "") + "\n"
+             return text.strip()
+         except Exception as e:
+             print(f"PDF parsing error: {e}")
+
+     # Otherwise (or if PDF parsing failed) treat the file as an image and OCR it.
+     try:
+         image = Image.open(file_path)
+         text = pytesseract.image_to_string(image)
+     except Exception as e:
+         print(f"OCR error: {e}")
+     return text.strip()
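+ # For scanned PDFs the OCR fallback above cannot help as written, because PIL
+ # does not open PDF files. One hedged sketch, using the already-imported fitz
+ # (PyMuPDF) to rasterize the first page before OCR (page selection and DPI are
+ # assumptions, not part of this commit):
+ #
+ #     doc = fitz.open(file_path)
+ #     pix = doc[0].get_pixmap(dpi=200)
+ #     image = Image.frombytes("RGB", (pix.width, pix.height), pix.samples)
+ #     text = pytesseract.image_to_string(image)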
 
+ def perform_semantic_analysis(text, analysis_type):
+     """Applies semantic analysis tasks to the provided text."""
+     if analysis_type == "Summarization":
+         return summarizer(text, max_length=150, min_length=40, do_sample=False)[0]['summary_text']
+     elif analysis_type == "Sentiment Analysis":
+         return sentiment_analyzer(text)[0]
+     elif analysis_type == "Named Entity Recognition":
+         return ner_tagger(text)
+     # Add more analysis types as needed
+     return text
+
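+ # Note: the default summarization checkpoint accepts roughly 1024 input tokens,
+ # so long extracted documents will raise a length error unless the text is
+ # truncated or chunked first. A minimal sketch (the 3000-character window is an
+ # assumption):
+ #
+ #     chunks = [text[i:i + 3000] for i in range(0, len(text), 3000)]
+ #     summary = " ".join(
+ #         summarizer(c, max_length=150, min_length=40, do_sample=False)[0]['summary_text']
+ #         for c in chunks
+ #     )
+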
+ def process_text_input(message_text, history, model_choice, analysis_type):
+     """
+     Processes text-based input with the selected model, applying semantic
+     analysis first if requested. (history is accepted to match the
+     ChatInterface signature but is not yet folded into the prompt.)
+     """
+     # Optionally perform semantic analysis before sending to the model
+     if analysis_type and analysis_type != "None":
+         analysis_result = perform_semantic_analysis(message_text, analysis_type)
+         # Incorporate the analysis result into the prompt (or display it separately)
+         message_text += f"\n\n[Analysis Result]: {analysis_result}"

+     # Construct a prompt for model inference
+     input_prompt = [{"role": "user", "content": message_text}]
+
+     if model_choice == "mistralai/Mistral-Nemo-Instruct-2411":
+         model = "mistral-large-2411"
+         stream_response = Mistralclient.chat.stream(model=model, messages=input_prompt)
+         partial_message = ""
+         for chunk in stream_response:
+             if chunk.data.choices[0].delta.content:
+                 partial_message += chunk.data.choices[0].delta.content
+                 yield partial_message
+     else:
+         stream = client.chat.completions.create(
+             model=model_choice,
+             messages=input_prompt,
+             temperature=0.5,
+             max_tokens=1024,
+             top_p=0.7,
+             stream=True
+         )
+         temp = ""
+         for chunk in stream:
+             if chunk.choices[0].delta.content:
+                 temp += chunk.choices[0].delta.content
+                 yield temp
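+ # Both branches yield the accumulated text so far rather than the raw delta:
+ # gr.ChatInterface redraws the assistant message with each yielded value, so
+ # yielding bare deltas would display only the newest fragment.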
 
+ def process_image_input(image_file, message_text, image_mod, model_choice, analysis_type):
+     """
+     Processes image-based input with the selected model and mode, applying OCR
+     and semantic analysis as needed.
+     """
+     # In a multimodal gr.ChatInterface, uploaded files arrive as local file
+     # paths, so the upload is usable directly as the image path.
+     temp_image_path = image_file
+
+     # Extract text from the document/image via PDF parsing or OCR
+     extracted_text = extract_text_from_document(temp_image_path)
+     if extracted_text:
+         message_text += f"\n\n[Extracted Text]: {extracted_text}"
+         # Optionally perform semantic analysis on the extracted text
+         if analysis_type and analysis_type != "None":
+             analysis_result = perform_semantic_analysis(extracted_text, analysis_type)
+             message_text += f"\n\n[Analysis Result]: {analysis_result}"
+
+     base64_image = encode_image(temp_image_path)
+     if not base64_image:
+         yield "Failed to process image."
          return

+     # The Vision endpoint (OpenAI-style) expects the nested {"url": ...} form,
+     # while Mistral's SDK accepts the bare data-URL string.
+     image_payload = (
+         {"url": f"data:image/jpeg;base64,{base64_image}"} if image_mod == "Vision"
+         else f"data:image/jpeg;base64,{base64_image}"
+     )
+     messages = [{
+         "role": "user",
+         "content": [
+             {"type": "text", "text": message_text},
+             {"type": "image_url", "image_url": image_payload}
+         ]
+     }]
+
      if image_mod == "Vision":
          stream = client.chat.completions.create(
              model="meta-llama/Llama-3.2-11B-Vision-Instruct",
              messages=messages,
              # ... (sampling parameters unchanged; hidden by the diff view)
          )
          temp = ""
          for chunk in stream:
+             if chunk.choices[0].delta.content:
                  temp += chunk.choices[0].delta.content
                  yield temp
      else:
          model = "pixtral-large-2411"
          partial_message = ""
          for chunk in Mistralclient.chat.stream(model=model, messages=messages):
+             if chunk.data.choices[0].delta.content:
                  partial_message += chunk.data.choices[0].delta.content
                  yield partial_message
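+ # Design note: encode_image() downsizes to 512 px height before encoding, and
+ # base64 inflates the bytes by about 4/3, which presumably keeps the data URL
+ # well inside typical request-size limits.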
 
+ def multimodal_response(message, history, analyzer_mode, model_choice, image_mod, analysis_type):
      """
+     Main response function: routes text and image inputs through parsing, OCR,
+     and semantic analysis as appropriate. (analyzer_mode is accepted from the
+     checkbox but not yet used for routing.)
      """
      message_text = message.get("text", "")
      message_files = message.get("files", [])

      if message_files:
+         # If an image/document is uploaded, process it
+         image_file = message_files[0]
+         yield from process_image_input(image_file, message_text, image_mod, model_choice, analysis_type)
      else:
+         # Otherwise process plain text input
+         yield from process_text_input(message_text, history, model_choice, analysis_type)
+
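+ # The additional_inputs below are passed to multimodal_response positionally,
+ # after (message, history): the Checkbox maps to analyzer_mode, the model
+ # Dropdown to model_choice, the Radio to image_mod, and the analysis Dropdown
+ # to analysis_type.
+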
+ # Set up the Gradio interface with additional user customization options
+ MultiModalAnalyzer = gr.ChatInterface(
+     fn=multimodal_response,
+     type="messages",
+     multimodal=True,
+     additional_inputs=[
+         gr.Checkbox(label="Enable Analyzer Mode", value=True),
+         gr.Dropdown(
+             choices=[
+                 "meta-llama/Llama-3.3-70B-Instruct",
+                 "CohereForAI/c4ai-command-r-plus-08-2024",
+                 "Qwen/Qwen2.5-72B-Instruct",
+                 "nvidia/Llama-3.1-Nemotron-70B-Instruct-HF",
+                 "NousResearch/Hermes-3-Llama-3.1-8B",
+                 "mistralai/Mistral-Nemo-Instruct-2411",
+                 "microsoft/phi-4"
+             ],
+             value="mistralai/Mistral-Nemo-Instruct-2411",
+             show_label=False,
+             container=False
+         ),
+         gr.Radio(
+             choices=["pixtral", "Vision"],
+             value="pixtral",
+             show_label=False,
+             container=False
+         ),
+         gr.Dropdown(
+             choices=["None", "Summarization", "Sentiment Analysis", "Named Entity Recognition"],
+             value="None",
+             label="Select Analysis Type",
+             container=False
+         )
+     ],
+     title="MultiModal Analyzer",
+     description="Upload documents or images, then select a model and an analysis type to interact with your content."
+ )
+
+ MultiModalAnalyzer.launch()
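+
+ # Assumed runtime dependencies for this Space (not pinned by this commit):
+ # gradio, pillow, pdfplumber, PyMuPDF, pytesseract, transformers (with torch),
+ # huggingface_hub, and mistralai, plus the system tesseract-ocr binary and the
+ # HF_TOKEN / MISTRAL_API_KEY secrets.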