Spaces:

Engr-Saeed
/

RAG_System

Sleeping

Engr-Saeed committed on Aug 21, 2024

Commit

6abe79a

verified ·

1 Parent(s): d76f892

Update app.py

Files changed (1) hide show

app.py CHANGED Viewed

@@ -23,23 +23,32 @@ genai.configure(api_key=api_key)
 # Step 4: Function to read files and extract text
 def extract_text(file):
     text = ""
-    if file.name.endswith(".pdf"):
-        pdf_reader = PdfReader(file)
-        for page in pdf_reader.pages:
-            text += page.extract_text()
-    elif file.name.endswith(".docx"):
-        text = docx2txt.process(file)
-    elif file.name.endswith(".txt"):
-        text = file.read().decode("utf-8")
-    elif file.name.endswith(".csv"):
-        df = pd.read_csv(file)
-        text = df.to_string()
-    elif file.name.endswith(".xlsx"):
-        df = pd.read_excel(file)
-        text = df.to_string()
-    elif file.name.endswith(".json"):
-        data = json.load(file)
-        text = json.dumps(data, indent=4)
     return text
 # Step 5: Function to convert text into chunks

 # Step 4: Function to read files and extract text
 def extract_text(file):
     text = ""
+    try:
+        if file.name.endswith(".pdf"):
+            pdf_reader = PdfReader(file)
+            for page in pdf_reader.pages:
+                text += page.extract_text()
+        elif file.name.endswith(".docx"):
+            text = docx2txt.process(file)
+        elif file.name.endswith(".txt"):
+            text = file.read().decode("utf-8")  # Assuming UTF-8 by default
+        elif file.name.endswith(".csv"):
+            df = pd.read_csv(file, encoding='utf-8')  # Assuming UTF-8 by default
+            text = df.to_string()
+        elif file.name.endswith(".xlsx"):
+            df = pd.read_excel(file)
+            text = df.to_string()
+        elif file.name.endswith(".json"):
+            data = json.load(file)
+            text = json.dumps(data, indent=4)
+    except UnicodeDecodeError:
+        # Handle the error by trying a different encoding
+        file.seek(0)  # Reset the file pointer
+        if file.name.endswith(".txt"):
+            text = file.read().decode("ISO-8859-1")  # Try Latin-1 encoding
+        elif file.name.endswith(".csv"):
+            df = pd.read_csv(file, encoding='ISO-8859-1')  # Try Latin-1 encoding
+            text = df.to_string()
     return text
 # Step 5: Function to convert text into chunks