Engr-Saeed committed on
Commit
6abe79a
·
verified ·
1 Parent(s): d76f892

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +26 -17
app.py CHANGED
@@ -23,23 +23,32 @@ genai.configure(api_key=api_key)
23
  # Step 4: Function to read files and extract text
24
  def extract_text(file):
25
  text = ""
26
- if file.name.endswith(".pdf"):
27
- pdf_reader = PdfReader(file)
28
- for page in pdf_reader.pages:
29
- text += page.extract_text()
30
- elif file.name.endswith(".docx"):
31
- text = docx2txt.process(file)
32
- elif file.name.endswith(".txt"):
33
- text = file.read().decode("utf-8")
34
- elif file.name.endswith(".csv"):
35
- df = pd.read_csv(file)
36
- text = df.to_string()
37
- elif file.name.endswith(".xlsx"):
38
- df = pd.read_excel(file)
39
- text = df.to_string()
40
- elif file.name.endswith(".json"):
41
- data = json.load(file)
42
- text = json.dumps(data, indent=4)
 
 
 
 
 
 
 
 
 
43
  return text
44
 
45
  # Step 5: Function to convert text into chunks
 
23
  # Step 4: Function to read files and extract text
24
  def extract_text(file):
25
  text = ""
26
+ try:
27
+ if file.name.endswith(".pdf"):
28
+ pdf_reader = PdfReader(file)
29
+ for page in pdf_reader.pages:
30
+ text += page.extract_text()
31
+ elif file.name.endswith(".docx"):
32
+ text = docx2txt.process(file)
33
+ elif file.name.endswith(".txt"):
34
+ text = file.read().decode("utf-8") # Assuming UTF-8 by default
35
+ elif file.name.endswith(".csv"):
36
+ df = pd.read_csv(file, encoding='utf-8') # Assuming UTF-8 by default
37
+ text = df.to_string()
38
+ elif file.name.endswith(".xlsx"):
39
+ df = pd.read_excel(file)
40
+ text = df.to_string()
41
+ elif file.name.endswith(".json"):
42
+ data = json.load(file)
43
+ text = json.dumps(data, indent=4)
44
+ except UnicodeDecodeError:
45
+ # Handle the error by trying a different encoding
46
+ file.seek(0) # Reset the file pointer
47
+ if file.name.endswith(".txt"):
48
+ text = file.read().decode("ISO-8859-1") # Try Latin-1 encoding
49
+ elif file.name.endswith(".csv"):
50
+ df = pd.read_csv(file, encoding='ISO-8859-1') # Try Latin-1 encoding
51
+ text = df.to_string()
52
  return text
53
 
54
  # Step 5: Function to convert text into chunks