cogcorp committed on
Commit
5f07ce9
·
1 Parent(s): 0b87fda

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +52 -37
app.py CHANGED
@@ -1,39 +1,54 @@
 
 
 
1
  import os
2
  import io
3
- import zipfile
4
- from pdf2image import convert_from_path
5
- import easyocr
6
- import gradio as gr
7
-
8
def convert_pdf_to_text(input_zip):
    """OCR every PDF inside an uploaded zip archive and return the text.

    Args:
        input_zip: Uploaded file object; only its ``name`` attribute (the
            path of the archive on disk) is used.

    Returns:
        The recognised text of all pages of all PDFs, joined with single
        spaces; an empty string if the archive contains no PDF; or the
        message ``"Please upload a .zip file."`` when the upload does not
        have a ``.zip`` extension.

    Raises:
        zipfile.BadZipFile: If the upload is named ``.zip`` but is not a
            valid archive.
    """
    import tempfile

    # Extension checks are case-insensitive so "SCAN.ZIP" / "a.PDF" work too.
    if not input_zip.name.lower().endswith(".zip"):
        return "Please upload a .zip file."

    page_texts = []

    with zipfile.ZipFile(input_zip.name, 'r') as zip_ref:
        pdf_names = [
            member for member in zip_ref.namelist()
            if member.lower().endswith('.pdf')
        ]
        if not pdf_names:
            # Nothing to OCR: skip loading the OCR model entirely.
            return ''

        reader = easyocr.Reader(['en'])  # Specify the language(s)

        # Extract into a throw-away directory so the files (and any nested
        # folders from the archive) are removed even if OCR raises.
        with tempfile.TemporaryDirectory() as work_dir:
            for file_name in pdf_names:
                pdf_file_path = zip_ref.extract(file_name, path=work_dir)

                # Convert the PDF to a list of page images.
                for image in convert_from_path(pdf_file_path):
                    # NOTE(review): easyocr documents file paths, numpy
                    # arrays and raw image bytes as readtext() inputs, so
                    # each page is serialised to PNG bytes first — confirm
                    # against the installed easyocr version.
                    buffer = io.BytesIO()
                    image.save(buffer, format='PNG')
                    # detail=0 returns only the recognised strings.
                    result = reader.readtext(buffer.getvalue(), detail=0)
                    page_texts.append(' '.join(result))

    # Join pages with a space so words at page boundaries do not fuse.
    return ' '.join(page_texts)
33
-
34
# Expose convert_pdf_to_text through a Gradio UI with one file-upload input
# and a plain-text output, then start serving it.
iface = gr.Interface(fn=convert_pdf_to_text, inputs=gr.inputs.File(), outputs="text")
iface.launch()
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ from PyPDF2 import PdfReader
3
+ import zipfile
4
  import os
5
  import io
6
+ import nltk
7
+ import openai
8
+
9
# The OpenAI API key is taken from the environment variable named 'OpenAPI';
# nothing is hard-coded here, and the key is None when the variable is unset.
openai.api_key = os.environ.get('OpenAPI')
11
+
12
def _extract_pdf_text(pdf_bytes):
    """Return the concatenated text of every page of an in-memory PDF."""
    reader = PdfReader(io.BytesIO(pdf_bytes))
    # Defensive: treat a page that yields no text as an empty string.
    return ''.join(page.extract_text() or '' for page in reader.pages)


def _split_for_model(text, max_tokens=2000):
    """Split *text* into pieces of at most *max_tokens* NLTK word tokens.

    Text that already fits is returned unchanged as a single piece; longer
    text is rebuilt from its tokens joined with single spaces. Note the
    limit counts NLTK word tokens, not the model's own tokens.
    """
    # NOTE(review): word_tokenize needs NLTK's tokenizer data to be
    # installed; nothing in the visible file downloads it — confirm.
    tokens = nltk.word_tokenize(text)
    if len(tokens) <= max_tokens:
        return [text]
    return [
        ' '.join(tokens[start:start + max_tokens])
        for start in range(0, len(tokens), max_tokens)
    ]


def _ask_model(user_prompt, content):
    """Send the prompt plus one piece of document text; return the reply."""
    response = openai.ChatCompletion.create(
        model="gpt-3.5-turbo",
        messages=[
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": user_prompt},
            {"role": "user", "content": content},
        ],
    )
    return response['choices'][0]['message']['content']


def pdf_to_text(file, user_prompt):
    """Run *user_prompt* over every PDF in an uploaded zip archive.

    Each PDF's text is extracted, split into pieces of at most 2000 word
    tokens, and every piece is sent to the chat model together with the
    prompt.

    Args:
        file: Uploaded file object; only its ``name`` attribute (the path of
            the zip archive on disk) is used.
        user_prompt: Instruction sent to the model ahead of each piece.

    Returns:
        The model replies for all pieces of all PDFs, in archive order,
        joined with newlines. Empty string if the archive has no PDF.

    Raises:
        zipfile.BadZipFile: If the upload is not a valid zip archive.
    """
    replies = []
    # The context manager closes the archive even if extraction or the
    # API call raises.
    with zipfile.ZipFile(file.name, 'r') as archive:
        for member in archive.namelist():
            # Case-insensitive so "REPORT.PDF" is not silently skipped.
            if not member.lower().endswith('.pdf'):
                continue
            text = _extract_pdf_text(archive.read(member))
            replies.extend(
                _ask_model(user_prompt, piece)
                for piece in _split_for_model(text)
            )
    return '\n'.join(replies)
52
+
53
# Gradio UI: a file upload (the zip archive) and a free-text prompt go in,
# plain text comes out.
iface = gr.Interface(
    fn=pdf_to_text,
    inputs=["file", "text"],
    outputs="text",
)
# share=True additionally requests a public Gradio share link.
iface.launch(share=True)