cogcorp committed on
Commit
0b87fda
·
1 Parent(s): 10589bd

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +26 -81
app.py CHANGED
@@ -1,94 +1,39 @@
1
  import os
 
2
  import zipfile
3
- import openai
 
4
  import gradio as gr
5
- from gradio import components as grc
6
 
7
# Set up OpenAI API credentials.
# The key is read from the environment so that no secret lives in the source
# tree. NOTE: any key that has ever been committed must be treated as
# compromised and rotated.
openai.api_key = os.environ.get("OPENAI_API_KEY")
 
9
 
10
# Function to extract text from PDF using OpenAI API
def extract_text_from_pdf(pdf_path):
    """Send the contents of a PDF file to the OpenAI completion endpoint.

    Args:
        pdf_path: Filesystem path of the PDF to read (it is passed to
            ``open``), not the PDF's raw content.

    Returns:
        The stripped text of the first completion choice.
    """
    with open(pdf_path, "rb") as f:
        pdf_bytes = f.read()

    # PDF files are binary and are generally not valid UTF-8; a strict decode
    # raises UnicodeDecodeError, so undecodable bytes are replaced instead.
    prompt = pdf_bytes.decode("utf-8", errors="replace")

    response = openai.Completion.create(
        engine="text-davinci-003",
        prompt=prompt,
        max_tokens=2048,
        temperature=0.7,
        n=1,
        stop=None,
        timeout=120,
    )
    return response.choices[0].text.strip()
24
 
25
# Function to extract text from multiple PDFs in a ZIP archive
def extract_text_from_zip(zip_file):
    """Extract text from every ``.pdf`` member of a ZIP archive.

    Args:
        zip_file: Path or seekable binary file object accepted by
            ``zipfile.ZipFile``.

    Returns:
        The text of each PDF, each followed by a newline, concatenated in
        archive order. An archive without PDFs yields ``""``.
    """
    import tempfile

    texts = []
    with zipfile.ZipFile(zip_file, "r") as zip_ref, \
            tempfile.TemporaryDirectory() as tmp_dir:
        for file_name in zip_ref.namelist():
            if not file_name.endswith(".pdf"):
                continue
            # extract_text_from_pdf opens its argument as a path, so the
            # member is materialised on disk rather than passed as raw bytes.
            pdf_path = zip_ref.extract(file_name, path=tmp_dir)
            texts.append(extract_text_from_pdf(pdf_path))
    return "".join(text + "\n" for text in texts)
34
-
35
# Function to split text into chunks based on maximum token length
def split_text_into_chunks(text, max_tokens=2048):
    """Split *text* on whitespace into chunks of bounded size.

    The budget is measured in characters, not model tokens: a word is added
    to the current chunk while the chunk's length (each word plus one
    separating space) plus the new word does not exceed ``max_tokens``.

    Args:
        text: The text to split.
        max_tokens: Character budget per chunk.

    Returns:
        A list of non-empty chunks with words joined by single spaces. A
        single word longer than the budget becomes its own chunk.
    """
    chunks = []
    current_words = []
    current_len = 0  # length of the pending chunk including trailing spaces
    for word in text.split():
        # Only flush a non-empty chunk; otherwise an over-long first word
        # would produce a spurious empty chunk.
        if current_words and current_len + len(word) > max_tokens:
            chunks.append(" ".join(current_words))
            current_words = []
            current_len = 0
        current_words.append(word)
        current_len += len(word) + 1
    if current_words:
        chunks.append(" ".join(current_words))
    return chunks
49
-
50
# Function to process files and query using OpenAI API
def process_files_and_query(zip_file, query):
    """Answer *query* against the text of the PDFs in an uploaded ZIP.

    Args:
        zip_file: The uploaded archive, forwarded to
            ``extract_text_from_zip``.
        query: The question appended to every chunk of extracted text.

    Returns:
        The per-chunk completions joined by single spaces (``""`` when no
        text was extracted).
    """
    # The archive is read directly from the upload. The previous copy to a
    # fixed "uploaded.zip" in the working directory was never used and was
    # left behind after every request.
    corpus = extract_text_from_zip(zip_file)

    # Split the corpus into chunks
    chunks = split_text_into_chunks(corpus)

    # Perform OpenAI API query on each chunk
    responses = []
    for chunk in chunks:
        response = openai.Completion.create(
            engine="text-davinci-003",
            prompt=chunk + "\nQuery: " + query,
            max_tokens=2048,
            temperature=0.7,
            n=1,
            stop=None,
            timeout=120,
        )
        responses.append(response.choices[0].text.strip())

    # Combine the responses into a single answer
    return " ".join(responses)
 
82
 
83
# Gradio input and output components
zip_file_input = grc.File(label="Upload ZIP File")
query_input = grc.Textbox(label="Enter your query")
output = grc.Textbox(label="Answer")

# Wire the components to the query function
iface = gr.Interface(
    fn=process_files_and_query,
    inputs=[zip_file_input, query_input],
    outputs=output,
    title="PDF Search",
    description="Upload a ZIP file containing PDFs, enter your query, and get the answer.",
)
 
 
 
90
  iface.launch()
91
-
92
-
93
-
94
-
 
1
  import os
2
+ import io
3
  import zipfile
4
+ from pdf2image import convert_from_path
5
+ import easyocr
6
  import gradio as gr
 
7
 
8
def convert_pdf_to_text(input_zip):
    """OCR every PDF inside an uploaded ZIP archive and return the text.

    Args:
        input_zip: The uploaded archive, either an object exposing the
            file's path as ``.name`` or the path itself as a string.

    Returns:
        The recognised text of all pages, one page per line, or the message
        ``"Please upload a .zip file."`` when the upload is not a ZIP.
    """
    import tempfile

    import numpy as np

    # Accept both a file wrapper (with .name) and a plain path string.
    zip_path = getattr(input_zip, "name", input_zip)
    if not str(zip_path).lower().endswith(".zip"):
        return "Please upload a .zip file."

    reader = easyocr.Reader(['en'])  # Specify the language(s)

    page_texts = []
    # Extract into a temporary directory so the PDFs are removed even when
    # conversion or OCR raises, and never land in the working directory.
    with zipfile.ZipFile(zip_path, 'r') as zip_ref, \
            tempfile.TemporaryDirectory() as work_dir:
        for file_name in zip_ref.namelist():
            if not file_name.lower().endswith('.pdf'):
                continue
            pdf_file_path = zip_ref.extract(file_name, path=work_dir)

            # Convert PDF to a list of images (one per page)
            images = convert_from_path(pdf_file_path)

            for image in images:
                # readtext is given a numpy array rather than the PIL page
                # object; detail=0 returns only the OCR'd text.
                result = reader.readtext(np.asarray(image), detail=0)
                page_texts.append(' '.join(result))

    # Separate pages so the last word of one page is not fused with the
    # first word of the next.
    return '\n'.join(page_texts)
 
 
 
33
 
34
# Build the web UI: one file upload in, plain text out.
# gr.File is the current component; the gr.inputs namespace is deprecated.
iface = gr.Interface(
    fn=convert_pdf_to_text,
    inputs=gr.File(),
    outputs="text",
)
39
  iface.launch()