# Page-scrape residue (HuggingFace Spaces status header), commented out so the file parses:
# Spaces:
# Sleeping
# Sleeping
# Standard library
import io
import os
import zipfile

# Third-party
import gradio as gr
import nltk
import openai
from PyPDF2 import PdfReader

# API key is read from the "OpenAPI" environment variable (set as a secret
# in the hosting environment); openai calls fail with an auth error if unset.
openai.api_key = os.getenv('OpenAPI')
def pdf_to_text(file, user_prompt):
    """Summarize every PDF inside an uploaded zip archive via the OpenAI chat API.

    Args:
        file: Uploaded file object (Gradio); ``file.name`` is the path to a zip
            archive containing one or more ``.pdf`` members.
        user_prompt: Instruction forwarded to the model ahead of each text chunk.

    Returns:
        str: The model's response for every chunk of every PDF, joined by newlines.
    """
    MAX_TOKENS = 2000  # rough per-request input budget for gpt-3.5-turbo
    texts = []
    # Context manager closes the archive deterministically (the original
    # leaked the ZipFile handle).
    with zipfile.ZipFile(file.name, 'r') as archive:
        for filename in archive.namelist():
            if not filename.endswith('.pdf'):
                continue
            pdf = PdfReader(io.BytesIO(archive.read(filename)))
            # extract_text() returns None for image-only pages; guard with
            # `or ''` so the join never sees None.
            text = ''.join(page.extract_text() or '' for page in pdf.pages)
            tokens = nltk.word_tokenize(text)
            if len(tokens) > MAX_TOKENS:
                # Re-joining tokens with spaces loses the original whitespace,
                # which is acceptable for summarization-style prompts.
                chunks = [
                    ' '.join(tokens[i:i + MAX_TOKENS])
                    for i in range(0, len(tokens), MAX_TOKENS)
                ]
            else:
                # Small documents go through verbatim, as in the original.
                chunks = [text]
            for chunk in chunks:
                texts.append(_ask_model(user_prompt, chunk))
    return '\n'.join(texts)


def _ask_model(user_prompt, content):
    """Send one prompt + content pair to gpt-3.5-turbo; return the reply text."""
    response = openai.ChatCompletion.create(
        model="gpt-3.5-turbo",
        messages=[
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": user_prompt},
            {"role": "user", "content": content},
        ],
    )
    return response['choices'][0]['message']['content']
# Gradio UI: a file upload (zip of PDFs) plus a free-text prompt in,
# the concatenated model output as text out. share=False keeps the app local.
iface = gr.Interface(fn=pdf_to_text, inputs=["file", "text"], outputs="text")
iface.launch(share=False)