"""Gradio app that answers a query against the PDFs inside an uploaded ZIP.

Each PDF's content is sent through the OpenAI completion endpoint, the
results are concatenated into a corpus, the corpus is split into chunks, and
the query is asked against every chunk.
"""
import os
import zipfile

import openai
import gradio as gr
from gradio import components as grc

# Set up OpenAI API credentials.
# The key is read from the environment so the secret is never stored in the
# source tree. Any key that was ever committed must be treated as leaked and
# revoked.
openai.api_key = os.environ.get("OPENAI_API_KEY")

# Shared settings for every completion request made by this module.
_COMPLETION_ENGINE = "text-davinci-003"
_COMPLETION_MAX_TOKENS = 2048
_COMPLETION_TEMPERATURE = 0.7
_COMPLETION_TIMEOUT = 120


def _complete(prompt):
    """Send ``prompt`` to the completion endpoint and return the stripped text."""
    response = openai.Completion.create(
        engine=_COMPLETION_ENGINE,
        prompt=prompt,
        max_tokens=_COMPLETION_MAX_TOKENS,
        temperature=_COMPLETION_TEMPERATURE,
        n=1,
        stop=None,
        timeout=_COMPLETION_TIMEOUT,
    )
    return response.choices[0].text.strip()


def _extract_text_from_pdf_bytes(pdf_bytes):
    """Run the raw bytes of one PDF through the completion endpoint."""
    # PDFs are binary, so a strict UTF-8 decode raises on almost any real
    # document; undecodable bytes are dropped instead.
    # NOTE(review): compressed PDF streams will not yield readable text this
    # way, and a large document may exceed the model's context window. A
    # dedicated PDF parser would be needed for reliable extraction.
    return _complete(pdf_bytes.decode("utf-8", errors="ignore"))


# Function to extract text from PDF using OpenAI API
def extract_text_from_pdf(pdf_path):
    """Read the PDF at ``pdf_path`` and return the model's completion for it.

    Args:
        pdf_path: Filesystem path of the PDF to read.

    Returns:
        The stripped text of the first completion choice.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    with open(pdf_path, "rb") as f:
        pdf_bytes = f.read()
    return _extract_text_from_pdf_bytes(pdf_bytes)


# Function to extract text from multiple PDFs in a ZIP archive
def extract_text_from_zip(zip_file):
    """Return the combined extracted text of every PDF member of an archive.

    Args:
        zip_file: Path to a ZIP archive, or a seekable binary file object
            (anything ``zipfile.ZipFile`` accepts).

    Returns:
        The extracted text of each ``.pdf`` member (extension matched
        case-insensitively), in archive order, each followed by a newline.
        An empty string when the archive holds no PDFs.

    Raises:
        zipfile.BadZipFile: If ``zip_file`` is not a valid ZIP archive.
    """
    texts = []
    with zipfile.ZipFile(zip_file, "r") as zip_ref:
        for file_name in zip_ref.namelist():
            if file_name.lower().endswith(".pdf"):
                # Members are read straight into memory, so the bytes-based
                # helper is used rather than the path-based public function.
                texts.append(
                    _extract_text_from_pdf_bytes(zip_ref.read(file_name))
                )
    return "".join(text + "\n" for text in texts)


# Function to split text into chunks based on maximum length
def split_text_into_chunks(text, max_tokens=2048):
    """Split ``text`` on whitespace into chunks of bounded length.

    Args:
        text: The text to split.
        max_tokens: Maximum length of a chunk. Despite the name, the limit is
            measured in characters (``len`` of the joined words), not in
            model tokens.

    Returns:
        A list of chunks, each made of whole words joined by single spaces.
        A single word longer than ``max_tokens`` becomes a chunk of its own.
        No empty chunks are produced; empty or whitespace-only text yields
        an empty list.
    """
    chunks = []
    current_words = []
    current_length = 0  # Characters used so far, one separator per word.
    for word in text.split():
        # Flush only when there is something to flush, so an oversized first
        # word does not produce an empty chunk.
        if current_words and current_length + len(word) > max_tokens:
            chunks.append(" ".join(current_words))
            current_words = []
            current_length = 0
        current_words.append(word)
        current_length += len(word) + 1
    if current_words:
        chunks.append(" ".join(current_words))
    return chunks


# Function to process files and query using OpenAI API
def process_files_and_query(zip_file, query):
    """Answer ``query`` against the PDFs contained in an uploaded ZIP.

    Args:
        zip_file: The upload handed over by the Gradio ``File`` component:
            either a filesystem path or a file object exposing the path via
            ``.name``.
        query: The question to ask about the documents.

    Returns:
        The per-chunk answers joined by single spaces; an empty string when
        no text could be extracted.

    Raises:
        ValueError: If no file was uploaded.
        zipfile.BadZipFile: If the upload is not a valid ZIP archive.
    """
    if zip_file is None:
        raise ValueError("No ZIP file was uploaded.")

    # Open the upload in place via its path when it has one, instead of
    # copying it to a fixed file name that concurrent requests would
    # overwrite.
    zip_source = getattr(zip_file, "name", zip_file)

    # Extract text from PDFs in the ZIP archive
    corpus = extract_text_from_zip(zip_source)

    # Perform OpenAI API query on each chunk of the corpus
    responses = [
        _complete(chunk + "\nQuery: " + query)
        for chunk in split_text_into_chunks(corpus)
    ]

    # Combine the responses into a single answer
    return " ".join(responses)


# Gradio input and output interfaces
zip_file_input = grc.File(label="Upload ZIP File")
query_input = grc.Textbox(label="Enter your query")
output = grc.Textbox(label="Answer")

# Gradio interface configuration
iface = gr.Interface(
    fn=process_files_and_query,
    inputs=[zip_file_input, query_input],
    outputs=output,
    title="PDF Search",
    description="Upload a ZIP file containing PDFs, enter your query, and get the answer.",
)

# Only start the web server when run as a script, so importing this module
# (e.g. to reuse the helpers) has no side effect beyond building `iface`.
if __name__ == "__main__":
    iface.launch()