shashankkandimalla committed on
Commit
7c3b051
·
verified ·
1 Parent(s): 704fe84

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +63 -49
app.py CHANGED
@@ -1,3 +1,7 @@
 
 
 
 
1
  import gradio as gr
2
  import weaviate
3
  from weaviate.embedded import EmbeddedOptions
@@ -5,9 +9,6 @@ import os
5
  from openai import OpenAI
6
  from dotenv import load_dotenv
7
  import textwrap
8
- from functools import lru_cache
9
- import asyncio
10
- import aiohttp
11
 
12
  # Load environment variables
13
  load_dotenv()
@@ -27,20 +28,19 @@ client = weaviate.Client(
27
  # Get the collection name from environment variable
28
  COLLECTION_NAME = os.getenv('WEAVIATE_COLLECTION_NAME')
29
 
@lru_cache(maxsize=1000)
def get_embedding(text):
    """Return the OpenAI embedding vector for ``text``.

    Results are memoised for up to 1000 distinct inputs, so asking for the
    same text again does not issue another embeddings request.
    """
    request_kwargs = {
        "input": text,
        "model": "text-embedding-3-small",
    }
    result = openai_client.embeddings.create(**request_kwargs)
    # Only one input is sent, so the first data entry is its embedding.
    first_entry = result.data[0]
    return first_entry.embedding
37
 
38
- async def search_multimodal(query: str, limit: int = 30, alpha: float = 0.6):
39
  query_vector = get_embedding(query)
40
 
41
  try:
42
- response = await asyncio.to_thread(
43
- lambda: client.query
44
  .get(COLLECTION_NAME, ["content_type", "url", "source_document", "page_number",
45
  "paragraph_number", "text", "image_path", "description", "table_content"])
46
  .with_hybrid(
@@ -56,69 +56,70 @@ async def search_multimodal(query: str, limit: int = 30, alpha: float = 0.6):
56
  print(f"An error occurred during the search: {str(e)}")
57
  return []
58
 
async def generate_response(query: str, context: str) -> str:
    """Ask the OpenAI chat completions endpoint to answer ``query``.

    The retrieved ``context`` and the user's ``query`` are embedded in a
    fixed semiconductor-analyst prompt and posted to the
    ``gpt-4-turbo-preview`` model with ``temperature`` 0.

    Args:
        query: The user's question.
        context: Text assembled from the search results.

    Returns:
        The content of the first choice in the API response.

    Raises:
        aiohttp.ClientResponseError: If the API replies with an HTTP error
            status (4xx/5xx).
    """
    prompt = f"""
    You are an AI assistant with extensive expertise in the semiconductor industry. Your knowledge spans a wide range of companies, technologies, and products, including but not limited to: System-on-Chip (SoC) designs, Field-Programmable Gate Arrays (FPGAs), Microcontrollers, Integrated Circuits (ICs), semiconductor manufacturing processes, and emerging technologies like quantum computing and neuromorphic chips.
    Use the following context, your vast knowledge, and the user's question to generate an accurate, comprehensive, and insightful answer. While formulating your response, follow these steps internally:
    Analyze the question to identify the main topic and specific information requested.
    Evaluate the provided context and identify relevant information.
    Retrieve additional relevant knowledge from your semiconductor industry expertise.
    Reason and formulate a response by combining context and knowledge.
    Generate a detailed response that covers all aspects of the query.
    Review and refine your answer for coherence and accuracy.
    In your output, provide only the final, polished response. Do not include your step-by-step reasoning or mention the process you followed.
    IMPORTANT: Ensure your response is grounded in factual information. Do not hallucinate or invent information. If you're unsure about any aspect of the answer or if the necessary information is not available in the provided context or your knowledge base, clearly state this uncertainty. It's better to admit lack of information than to provide inaccurate details.
    Your response should be:
    Thorough and directly address all aspects of the user's question
    Based solely on factual information from the provided context and your reliable knowledge
    Include specific examples, data points, or case studies only when you're certain of their accuracy
    Explain technical concepts clearly, considering the user may have varying levels of expertise
    Clearly indicate any areas where information is limited or uncertain
    Context: {context}
    User Question: {query}
    Based on the above context and your extensive knowledge of the semiconductor industry, provide your detailed, accurate, and grounded response below. Remember, only include information you're confident is correct, and clearly state any uncertainties:
    """

    headers = {
        "Authorization": f"Bearer {os.getenv('OPENAI_API_KEY')}",
        "Content-Type": "application/json"
    }
    payload = {
        "model": "gpt-4-turbo-preview",
        "messages": [
            {"role": "system", "content": "You are an expert Semi Conductor industry analyst"},
            {"role": "user", "content": prompt}
        ],
        "temperature": 0
    }

    async with aiohttp.ClientSession() as session:
        async with session.post(
            "https://api.openai.com/v1/chat/completions",
            headers=headers,
            json=payload
        ) as response:
            # Error bodies carry no 'choices' key; fail with the HTTP status
            # instead of an opaque KeyError further down.
            response.raise_for_status()
            result = await response.json()
            return result['choices'][0]['message']['content']
100
-
async def esg_analysis(user_query: str):
    """Search for ``user_query``, generate an answer, and list the top sources.

    Returns:
        A ``(response, sources)`` tuple: the generated answer and a list of
        dicts describing the first five search hits. ``paragraph`` is only
        filled for text hits and ``image_path`` only for image hits; both
        are ``None`` otherwise.
    """
    search_results = await search_multimodal(user_query)

    # One line per hit: "<Type> from <document> (Page <n>): <text or description>".
    context_parts = []
    for item in search_results:
        origin = f"{item['content_type'].capitalize()} from {item['source_document']} "
        body = f"(Page {item['page_number']}): {item.get('text') or item.get('description')}"
        context_parts.append(origin + body)
    context = "\n\n".join(context_parts)

    response = await generate_response(user_query, context)

    sources = []
    for item in search_results[:5]:
        kind = item.get("content_type")
        sources.append({
            "type": item.get("content_type", "Unknown"),
            "document": item.get("source_document", "N/A"),
            "page": item.get("page_number", "N/A"),
            "paragraph": item.get("paragraph_number", "N/A") if kind == 'text' else None,
            "image_path": item.get("image_path", "N/A") if kind == 'image' else None
        })

    return response, sources
124
 
@@ -126,14 +127,27 @@ def wrap_text(text, width=120):
126
  return textwrap.fill(text, width=width)
127
 
def gradio_interface(user_question):
    """Gradio callback: run the async analysis and format both outputs.

    Returns:
        A ``(formatted_response, source_text)`` tuple of Markdown strings.
    """
    ai_response, sources = asyncio.run(esg_analysis(user_question))

    formatted_response = f"## AI Response\n{ai_response}"

    # One "### Source N" section per source; fields whose value is None are
    # left out of the bullet list.
    sections = []
    for position, source in enumerate(sources):
        bullets = [
            f"- **{k.title()}:** {v}"
            for k, v in source.items()
            if v is not None
        ]
        sections.append(f"### Source {position+1}\n" + "\n".join(bullets))

    source_text = "## Top 5 Sources\n\n" + "\n\n".join(sections)

    return formatted_response, source_text
139
 
@@ -146,7 +160,7 @@ iface = gr.Interface(
146
  ],
147
  title="Semiconductor Industry ESG Analysis",
148
  description="Ask questions about the semiconductor industry and get AI-powered answers with sources.",
149
- flagging_dir="/app/flagged"
150
  )
151
 
152
  if __name__ == "__main__":
 
1
+
2
+
3
+
4
+
5
  import gradio as gr
6
  import weaviate
7
  from weaviate.embedded import EmbeddedOptions
 
9
  from openai import OpenAI
10
  from dotenv import load_dotenv
11
  import textwrap
 
 
 
12
 
13
  # Load environment variables
14
  load_dotenv()
 
28
  # Get the collection name from environment variable
29
  COLLECTION_NAME = os.getenv('WEAVIATE_COLLECTION_NAME')
30
 
 
def get_embedding(text):
    """Return the OpenAI ``text-embedding-3-large`` vector for ``text``."""
    request_kwargs = {
        "input": text,
        "model": "text-embedding-3-large",
    }
    result = openai_client.embeddings.create(**request_kwargs)
    # Only one input is sent, so the first data entry is its embedding.
    first_entry = result.data[0]
    return first_entry.embedding
37
 
38
+ def search_multimodal(query: str, limit: int = 30, alpha: float = 0.6):
39
  query_vector = get_embedding(query)
40
 
41
  try:
42
+ response = (
43
+ client.query
44
  .get(COLLECTION_NAME, ["content_type", "url", "source_document", "page_number",
45
  "paragraph_number", "text", "image_path", "description", "table_content"])
46
  .with_hybrid(
 
56
  print(f"An error occurred during the search: {str(e)}")
57
  return []
58
 
def generate_response(query: str, context: str) -> str:
    """Generate an answer to ``query`` with the ``gpt-4o`` chat model.

    The retrieved ``context`` and the user's ``query`` are embedded in a
    fixed semiconductor-analyst prompt; the request uses ``temperature`` 0.

    Args:
        query: The user's question.
        context: Text assembled from the search results.

    Returns:
        The message content of the first completion choice.
    """
    prompt = f"""
    You are an AI assistant with extensive expertise in the semiconductor industry. Your knowledge spans a wide range of companies, technologies, and products, including but not limited to: System-on-Chip (SoC) designs, Field-Programmable Gate Arrays (FPGAs), Microcontrollers, Integrated Circuits (ICs), semiconductor manufacturing processes, and emerging technologies like quantum computing and neuromorphic chips.
    Use the following context, your vast knowledge, and the user's question to generate an accurate, comprehensive, and insightful answer. While formulating your response, follow these steps internally:

    Analyze the question to identify the main topic and specific information requested.
    Evaluate the provided context and identify relevant information.
    Retrieve additional relevant knowledge from your semiconductor industry expertise.
    Reason and formulate a response by combining context and knowledge.
    Generate a detailed response that covers all aspects of the query.
    Review and refine your answer for coherence and accuracy.

    In your output, provide only the final, polished response. Do not include your step-by-step reasoning or mention the process you followed.
    IMPORTANT: Ensure your response is grounded in factual information. Do not hallucinate or invent information. If you're unsure about any aspect of the answer or if the necessary information is not available in the provided context or your knowledge base, clearly state this uncertainty. It's better to admit lack of information than to provide inaccurate details.
    Your response should be:

    Thorough and directly address all aspects of the user's question
    Based solely on factual information from the provided context and your reliable knowledge
    Include specific examples, data points, or case studies only when you're certain of their accuracy
    Explain technical concepts clearly, considering the user may have varying levels of expertise
    Clearly indicate any areas where information is limited or uncertain

    Context: {context}
    User Question: {query}
    Based on the above context and your extensive knowledge of the semiconductor industry, provide your detailed, accurate, and grounded response below. Remember, only include information you're confident is correct, and clearly state any uncertainties:
    """

    # System message sets the persona; the user message carries the full prompt.
    chat_messages = [
        {"role": "system", "content": "You are an expert Semi Conductor industry analyst"},
        {"role": "user", "content": prompt}
    ]
    completion = openai_client.chat.completions.create(
        model="gpt-4o",
        messages=chat_messages,
        temperature=0
    )

    first_choice = completion.choices[0]
    return first_choice.message.content
96
 
def _format_context_entry(item):
    """Render one search hit as a context paragraph ('' for unknown types)."""
    content_type = item.get("content_type")
    document = item.get("source_document", "N/A")
    page = item.get("page_number", "N/A")

    if content_type == 'text':
        paragraph = item.get("paragraph_number", "N/A")
        return f"Text from {document} (Page {page}, Paragraph {paragraph}): {item.get('text', 'N/A')}\n\n"
    if content_type == 'image':
        image_path = item.get("image_path", "N/A")
        return f"Image Description from {document} (Page {page}, Path: {image_path}): {item.get('description', 'N/A')}\n\n"
    if content_type == 'table':
        return f"Table Description from {document} (Page {page}): {item.get('description', 'N/A')}\n\n"
    # Hits of any other content type contribute nothing to the context.
    return ""


def _build_source(item):
    """Summarise one search hit as a source dict for display."""
    source = {
        "type": item.get("content_type", "Unknown"),
        "document": item.get("source_document", "N/A"),
        "page": item.get("page_number", "N/A"),
    }
    content_type = item.get("content_type")
    if content_type == 'text':
        source["paragraph"] = item.get("paragraph_number", "N/A")
    elif content_type == 'image':
        source["image_path"] = item.get("image_path", "N/A")
    return source


def esg_analysis(user_query: str):
    """Answer ``user_query`` from retrieved documents and list the top sources.

    Runs ``search_multimodal``, renders the text, image and table hits into
    a plain-text context, passes that context to ``generate_response``, and
    summarises the first five hits as sources.

    Args:
        user_query: The user's question.

    Returns:
        A ``(response, sources)`` tuple. ``sources`` is a list of dicts with
        keys ``type``, ``document`` and ``page``, plus ``paragraph`` for
        text hits or ``image_path`` for image hits.
    """
    search_results = search_multimodal(user_query)

    # Missing keys fall back to "N/A" (same convention as the source
    # summaries) instead of raising KeyError halfway through.
    context = "".join(_format_context_entry(item) for item in search_results)

    response = generate_response(user_query, context)

    # Limit to top 5 sources
    sources = [_build_source(item) for item in search_results[:5]]

    return response, sources
125
 
 
127
  return textwrap.fill(text, width=width)
128
 
def gradio_interface(user_question):
    """Gradio callback: answer ``user_question`` and format both outputs.

    Args:
        user_question: The question typed into the interface.

    Returns:
        A ``(formatted_response, source_text)`` tuple of Markdown strings:
        the AI answer under an "AI Response" heading, and one
        "Source N" section per source returned by ``esg_analysis``.
    """
    ai_response, sources = esg_analysis(user_question)

    # Format AI response. Assembled from explicit pieces rather than an
    # indented triple-quoted block, so the Markdown never inherits the
    # source file's indentation (indented lines can render as a code block).
    formatted_response = f"## AI Response\n\n{ai_response}\n"

    # Format sources
    sections = ["## Top 5 Sources\n\n"]
    for i, source in enumerate(sources, 1):
        lines = [
            f"### Source {i}",
            f"- **Type:** {source['type']}",
            f"- **Document:** {source['document']}",
            f"- **Page:** {source['page']}",
        ]
        # Optional fields exist only for text / image sources respectively.
        if 'paragraph' in source:
            lines.append(f"- **Paragraph:** {source['paragraph']}")
        if 'image_path' in source:
            lines.append(f"- **Image Path:** {source['image_path']}")
        # Each section ends with its own newline plus one blank line.
        sections.append("\n".join(lines) + "\n\n")
    source_text = "".join(sections)

    return formatted_response, source_text
153
 
 
160
  ],
161
  title="Semiconductor Industry ESG Analysis",
162
  description="Ask questions about the semiconductor industry and get AI-powered answers with sources.",
163
+ flagging_dir="/app/flagged" # Specify the flagging directory
164
  )
165
 
166
  if __name__ == "__main__":