Spaces:
Sleeping
Sleeping
Kevin Wu
committed on
Commit
·
0e0266f
1
Parent(s):
6adea60
Initial
Browse files
app.py
CHANGED
@@ -9,6 +9,7 @@ import re
|
|
9 |
import pandas as pd
|
10 |
import prompts
|
11 |
import traceback
|
|
|
12 |
|
13 |
client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))
|
14 |
|
@@ -31,15 +32,20 @@ def parse_xml_response(xml_string: str) -> pd.DataFrame:
|
|
31 |
then convert it to a pandas DataFrame with a nested index.
|
32 |
"""
|
33 |
try:
|
34 |
-
# Extract only the XML content between the
|
35 |
-
xml_content = re.search(r'
|
36 |
if xml_content:
|
37 |
xml_string = xml_content.group(0)
|
38 |
else:
|
39 |
print("No valid XML content found.")
|
40 |
return pd.DataFrame()
|
41 |
|
42 |
-
root
|
|
|
|
|
|
|
|
|
|
|
43 |
|
44 |
result = {}
|
45 |
|
@@ -113,35 +119,40 @@ def parse_xml_response(xml_string: str) -> pd.DataFrame:
|
|
113 |
print(f"Traceback: {traceback.format_exc()}")
|
114 |
return pd.DataFrame()
|
115 |
|
116 |
-
def get_response(file_id, assistant_id):
|
117 |
-
|
118 |
-
|
119 |
-
|
120 |
-
|
121 |
-
|
122 |
-
|
123 |
-
|
124 |
-
|
125 |
-
|
126 |
-
|
127 |
-
|
128 |
-
|
129 |
-
|
130 |
-
|
131 |
-
|
132 |
-
|
133 |
-
|
134 |
-
|
135 |
-
|
136 |
-
|
137 |
-
|
138 |
-
|
139 |
-
|
140 |
-
|
141 |
-
|
142 |
-
|
143 |
-
|
144 |
-
|
|
|
|
|
|
|
|
|
|
|
145 |
|
146 |
def process(file_content):
|
147 |
try:
|
@@ -153,9 +164,17 @@ def process(file_content):
|
|
153 |
|
154 |
message_file = client.files.create(file=open(file_name, "rb"), purpose="assistants")
|
155 |
|
156 |
-
response = get_response(message_file.id, demo.id)
|
157 |
df = parse_xml_response(response)
|
158 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
159 |
if df.empty:
|
160 |
return "<p>No valid information could be extracted from the provided file.</p>"
|
161 |
|
|
|
9 |
import pandas as pd
|
10 |
import prompts
|
11 |
import traceback
|
12 |
+
from io import StringIO
|
13 |
|
14 |
client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))
|
15 |
|
|
|
32 |
then convert it to a pandas DataFrame with a nested index.
|
33 |
"""
|
34 |
try:
|
35 |
+
# Extract only the XML content between the outermost tags
|
36 |
+
xml_content = re.search(r'<[^>]+>.*</[^>]+>', xml_string, re.DOTALL)
|
37 |
if xml_content:
|
38 |
xml_string = xml_content.group(0)
|
39 |
else:
|
40 |
print("No valid XML content found.")
|
41 |
return pd.DataFrame()
|
42 |
|
43 |
+
# Wrap the content in a root element to ensure there's only one root
|
44 |
+
xml_string = f"<root>{xml_string}</root>"
|
45 |
+
|
46 |
+
# Parse the XML
|
47 |
+
parser = ET.XMLParser(recover=True) # This allows for more lenient parsing
|
48 |
+
root = ET.fromstring(xml_string, parser=parser)
|
49 |
|
50 |
result = {}
|
51 |
|
|
|
119 |
print(f"Traceback: {traceback.format_exc()}")
|
120 |
return pd.DataFrame()
|
121 |
|
122 |
+
def get_response(file_id, assistant_id, max_retries=3):
    """Ask the assistant about an uploaded file and return its reply text.

    Each attempt creates a fresh thread containing one user message
    (``prompts.info_prompt`` with the file attached for the ``file_search``
    tool), runs the assistant on it with ``create_and_poll``, and reads the
    messages produced by that run. The text of every annotation on the reply
    is removed from the reply before it is returned.

    Args:
        file_id: ID of the uploaded file to attach to the user message.
        assistant_id: ID of the assistant to run on the thread.
        max_retries: Maximum number of attempts before giving up.

    Returns:
        The text of the assistant's single reply with annotation text removed.

    Raises:
        Exception: If no attempt succeeds (including ``max_retries <= 0``).
            When an attempt failed, the last underlying error is chained as
            ``__cause__``.
    """
    for attempt in range(max_retries):
        try:
            thread = client.beta.threads.create(
                messages=[
                    {
                        "role": "user",
                        "content": prompts.info_prompt,
                        "attachments": [
                            {"file_id": file_id, "tools": [{"type": "file_search"}]}
                        ],
                    }
                ]
            )
            run = client.beta.threads.runs.create_and_poll(
                thread_id=thread.id, assistant_id=assistant_id
            )
            messages = list(
                client.beta.threads.messages.list(thread_id=thread.id, run_id=run.id)
            )
            # Explicit raise instead of `assert`: assertions are stripped under
            # `python -O`, which would let an unexpected message count through.
            if len(messages) != 1:
                raise ValueError(f"Expected 1 message, got {len(messages)}")

            message_content = messages[0].content[0].text
            reply = message_content.value
            # Remove each annotation's marker text from the reply.
            for annotation in message_content.annotations:
                reply = reply.replace(annotation.text, "")
            return reply
        except Exception as e:
            print(f"Error in get_response (attempt {attempt + 1}): {str(e)}")
            print(f"Traceback: {traceback.format_exc()}")
            if attempt < max_retries - 1:
                print("Retrying in 5 seconds...")
                time.sleep(5)
            else:
                # Chain the last failure so callers can inspect the root cause.
                raise Exception(
                    "Max retries reached. Unable to get response from the model."
                ) from e

    # Only reachable when max_retries <= 0: fail loudly instead of returning None.
    raise Exception("Max retries reached. Unable to get response from the model.")
|
156 |
|
157 |
def process(file_content):
|
158 |
try:
|
|
|
164 |
|
165 |
message_file = client.files.create(file=open(file_name, "rb"), purpose="assistants")
|
166 |
|
167 |
+
response = get_response(message_file.id, demo.id) # This now includes retry logic
|
168 |
df = parse_xml_response(response)
|
169 |
|
170 |
+
# ... (rest of the function remains the same)
|
171 |
+
|
172 |
+
except Exception as e:
|
173 |
+
error_message = f"An error occurred while processing the file: {str(e)}"
|
174 |
+
print(error_message)
|
175 |
+
print(f"Traceback: {traceback.format_exc()}")
|
176 |
+
return f"<p>{error_message}</p>"
|
177 |
+
|
178 |
if df.empty:
|
179 |
return "<p>No valid information could be extracted from the provided file.</p>"
|
180 |
|