# NOTE: the three lines that were here ("Spaces: Sleeping / Sleeping") were
# Hugging Face Spaces page-header residue from a scrape, not Python code.
import os
import tempfile
import zipfile

import gradio as gr
from gradio import components as grc
import openai
# Set up OpenAI API credentials.
# SECURITY: a live API key was previously hard-coded on this line (and has been
# published, so it must be revoked). Read the key from the environment instead.
openai.api_key = os.environ.get("OPENAI_API_KEY")
# Function to extract text from a PDF using the OpenAI API.
def extract_text_from_pdf(pdf_path):
    """Push a PDF's raw contents through the OpenAI completion API.

    Args:
        pdf_path: Filesystem path to a PDF file, or the file's raw ``bytes``
            (``extract_text_from_zip`` historically passed bytes directly,
            so both are accepted for backward compatibility).

    Returns:
        The stripped text of the first completion choice.
    """
    if isinstance(pdf_path, (bytes, bytearray)):
        pdf_bytes = bytes(pdf_path)
    else:
        with open(pdf_path, "rb") as f:
            pdf_bytes = f.read()
    # NOTE(review): a PDF is binary, not UTF-8 text — the previous strict
    # decode raised UnicodeDecodeError on any real PDF. "replace" keeps the
    # call from crashing, but a real extractor (e.g. pypdf) is needed for
    # meaningful text; the completion API does not parse PDF structure.
    response = openai.Completion.create(
        engine="text-davinci-003",
        prompt=pdf_bytes.decode("utf-8", errors="replace"),
        max_tokens=2048,
        temperature=0.7,
        n=1,
        stop=None,
        timeout=120,
    )
    return response.choices[0].text.strip()
# Function to extract text from multiple PDFs in a ZIP archive.
def extract_text_from_zip(zip_file):
    """Extract text from every ``.pdf`` member of a ZIP archive.

    Args:
        zip_file: Path to a ZIP archive, or an open seekable file object
            (anything ``zipfile.ZipFile`` accepts).

    Returns:
        The per-PDF extracted texts concatenated, each followed by a newline.
    """
    corpus_parts = []
    with zipfile.ZipFile(zip_file, "r") as zip_ref:
        for file_name in zip_ref.namelist():
            if file_name.endswith(".pdf"):
                # Bug fix: extract_text_from_pdf() opens a filesystem path,
                # but this previously passed the raw bytes from zip_ref.read().
                # Stage the member through a temp file and clean it up after.
                with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as tmp:
                    tmp.write(zip_ref.read(file_name))
                    tmp_path = tmp.name
                try:
                    corpus_parts.append(extract_text_from_pdf(tmp_path))
                finally:
                    os.remove(tmp_path)
    return "".join(part + "\n" for part in corpus_parts)
# Function to split text into chunks based on a maximum length.
def split_text_into_chunks(text, max_tokens=2048):
    """Greedily pack whitespace-separated words into chunks.

    Despite the parameter name, the limit is measured in characters: a word is
    added while the running chunk length (each word counted with one trailing
    space) plus the new word stays within ``max_tokens``; otherwise the current
    chunk is flushed and a fresh one started.

    Returns:
        List of space-joined chunks; empty input yields an empty list.
    """
    chunks = []
    buffer = []  # words of the chunk under construction
    used = 0     # buffered length, counting one trailing space per word
    for token in text.split():
        if used + len(token) <= max_tokens:
            buffer.append(token)
        else:
            # Flush — note this mirrors the original behavior of emitting an
            # empty chunk when the very first word already exceeds the limit.
            chunks.append(" ".join(buffer))
            buffer = [token]
            used = 0
        used += len(token) + 1
    if buffer:
        chunks.append(" ".join(buffer))
    return chunks
# Function to process uploaded files and answer a query using the OpenAI API.
def process_files_and_query(zip_file, query):
    """Answer *query* against the text of all PDFs in an uploaded ZIP.

    Args:
        zip_file: Uploaded ZIP — a file object (as Gradio's File component
            provides) or a filesystem path string.
        query: Free-text question appended to each corpus chunk.

    Returns:
        The per-chunk completion answers joined with spaces.
    """
    # Persist the upload to a local copy. Bug fix: the file object's pointer
    # may not be at the start, so rewind before reading; path inputs are
    # accepted directly.
    zip_path = "uploaded.zip"
    if hasattr(zip_file, "read"):
        zip_file.seek(0)
        with open(zip_path, "wb") as f:
            f.write(zip_file.read())
    else:
        zip_path = zip_file
    # Bug fix: previously the already-consumed file object was re-used here
    # while the saved copy went unused; extract from the saved path instead.
    corpus = extract_text_from_zip(zip_path)
    # Split the corpus so each prompt fits the model's context window.
    chunks = split_text_into_chunks(corpus)
    # Query the API once per chunk and collect the answers.
    responses = []
    for chunk in chunks:
        response = openai.Completion.create(
            engine="text-davinci-003",
            prompt=chunk + "\nQuery: " + query,
            max_tokens=2048,
            temperature=0.7,
            n=1,
            stop=None,
            timeout=120,
        )
        responses.append(response.choices[0].text.strip())
    # Combine the per-chunk responses into a single answer.
    return " ".join(responses)
# Gradio components: one file upload and one text query in, one text answer out.
uploaded_zip = grc.File(label="Upload ZIP File")
user_query = grc.Textbox(label="Enter your query")
answer_box = grc.Textbox(label="Answer")

# Wire the components to the handler and start the app.
iface = gr.Interface(
    fn=process_files_and_query,
    inputs=[uploaded_zip, user_query],
    outputs=answer_box,
    title="PDF Search",
    description="Upload a ZIP file containing PDFs, enter your query, and get the answer.",
)
iface.launch()