fidocure_note_extractor_3

Sleeping

App Files Community

Kevin Wu committed on Oct 9, 2024

Commit

0e0266f

1 Parent(s): 6adea60

Initial

Browse files

Files changed (1) hide show

app.py +52 -33

app.py CHANGED Viewed

@@ -9,6 +9,7 @@ import re
 import pandas as pd
 import prompts
 import traceback
 client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))
@@ -31,15 +32,20 @@ def parse_xml_response(xml_string: str) -> pd.DataFrame:
     then convert it to a pandas DataFrame with a nested index.
     """
     try:
-        # Extract only the XML content between the first and last tags
-        xml_content = re.search(r'<.*?>.*</.*?>', xml_string, re.DOTALL)
         if xml_content:
             xml_string = xml_content.group(0)
         else:
             print("No valid XML content found.")
             return pd.DataFrame()
-        root = ET.fromstring(xml_string)
         result = {}
@@ -113,35 +119,40 @@ def parse_xml_response(xml_string: str) -> pd.DataFrame:
         print(f"Traceback: {traceback.format_exc()}")
         return pd.DataFrame()
-def get_response(file_id, assistant_id):
-    try:
-        thread = client.beta.threads.create(
-            messages=[
-                {
-                    "role": "user",
-                    "content": prompts.info_prompt,
-                    "attachments": [
-                        {"file_id": file_id, "tools": [{"type": "file_search"}]}
-                    ],
-                }
-            ]
-        )
-        run = client.beta.threads.runs.create_and_poll(
-            thread_id=thread.id, assistant_id=assistant_id
-        )
-        messages = list(
-            client.beta.threads.messages.list(thread_id=thread.id, run_id=run.id)
-        )
-        assert len(messages) == 1, f"Expected 1 message, got {len(messages)}"
-        message_content = messages[0].content[0].text
-        annotations = message_content.annotations
-        for index, annotation in enumerate(annotations):
-            message_content.value = message_content.value.replace(annotation.text, f"")
-        return message_content.value
-    except Exception as e:
-        print(f"Error in get_response: {str(e)}")
-        print(f"Traceback: {traceback.format_exc()}")
-        raise
 def process(file_content):
     try:
@@ -153,9 +164,17 @@ def process(file_content):
         message_file = client.files.create(file=open(file_name, "rb"), purpose="assistants")
-        response = get_response(message_file.id, demo.id)
         df = parse_xml_response(response)
         if df.empty:
             return "<p>No valid information could be extracted from the provided file.</p>"

 import pandas as pd
 import prompts
 import traceback
+from io import StringIO
 client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))
     then convert it to a pandas DataFrame with a nested index.
     """
     try:
+        # Extract only the XML content between the outermost tags
+        xml_content = re.search(r'<[^>]+>.*</[^>]+>', xml_string, re.DOTALL)
         if xml_content:
             xml_string = xml_content.group(0)
         else:
             print("No valid XML content found.")
             return pd.DataFrame()
+        # Wrap the content in a root element to ensure there's only one root
+        xml_string = f"<root>{xml_string}</root>"
+        # Parse the XML
+        parser = ET.XMLParser(recover=True)  # This allows for more lenient parsing
+        root = ET.fromstring(xml_string, parser=parser)
         result = {}
         print(f"Traceback: {traceback.format_exc()}")
         return pd.DataFrame()
+def get_response(file_id, assistant_id, max_retries=3):
+    for attempt in range(max_retries):
+        try:
+            thread = client.beta.threads.create(
+                messages=[
+                    {
+                        "role": "user",
+                        "content": prompts.info_prompt,
+                        "attachments": [
+                            {"file_id": file_id, "tools": [{"type": "file_search"}]}
+                        ],
+                    }
+                ]
+            )
+            run = client.beta.threads.runs.create_and_poll(
+                thread_id=thread.id, assistant_id=assistant_id
+            )
+            messages = list(
+                client.beta.threads.messages.list(thread_id=thread.id, run_id=run.id)
+            )
+            assert len(messages) == 1, f"Expected 1 message, got {len(messages)}"
+            message_content = messages[0].content[0].text
+            annotations = message_content.annotations
+            for index, annotation in enumerate(annotations):
+                message_content.value = message_content.value.replace(annotation.text, f"")
+            return message_content.value
+        except Exception as e:
+            print(f"Error in get_response (attempt {attempt + 1}): {str(e)}")
+            print(f"Traceback: {traceback.format_exc()}")
+            if attempt < max_retries - 1:
+                print(f"Retrying in 5 seconds...")
+                time.sleep(5)
+            else:
+                raise Exception("Max retries reached. Unable to get response from the model.")
 def process(file_content):
     try:
         message_file = client.files.create(file=open(file_name, "rb"), purpose="assistants")
+        response = get_response(message_file.id, demo.id)  # This now includes retry logic
         df = parse_xml_response(response)
+        # ... (rest of the function remains the same)
+    except Exception as e:
+        error_message = f"An error occurred while processing the file: {str(e)}"
+        print(error_message)
+        print(f"Traceback: {traceback.format_exc()}")
+        return f"<p>{error_message}</p>"
         if df.empty:
             return "<p>No valid information could be extracted from the provided file.</p>"