Spaces:

cogcorp
/

assignment1

Sleeping

File size: 2,795 Bytes

5f07ce9
 
 
47125e3
0b87fda
5f07ce9
 
dc5e316
5f07ce9
5672db4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5f07ce9
 
 
dc5e316
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5f07ce9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
dc5e316
 
5f07ce9
 
dc5e316
 
5f07ce9
 
dc5e316
 
 
593d86e
 
dc5e316
593d86e
dc5e316
593d86e
dc5e316
fd8442d
dc5e316

import gradio as gr
from PyPDF2 import PdfReader
import zipfile
import os
import io
import nltk
import openai
import time


import pip
import subprocess
import sys 

# Install everything listed in requirements.txt, using the same interpreter
# that is running this script so the packages land in the active environment.
# check_call raises CalledProcessError if pip exits with a non-zero status.
# NOTE(review): this runs after the third-party imports at the top of the
# file, so it cannot rescue a missing gradio/PyPDF2/nltk/openai import.
subprocess.check_call(
    [sys.executable, "-m", "pip", "install", "-r", "requirements.txt"]
)

# Fetch the NLTK data the app relies on: 'punkt' first, then the 'all'
# collection.
# NOTE(review): 'all' presumably pulls the entire NLTK data set on every
# start-up -- confirm whether anything beyond the tokenizer data is needed.
for _nltk_resource in ('punkt', 'all'):
    nltk.download(_nltk_resource)








# The OpenAI API key is taken from the environment variable named 'OpenAPI'
# (set it in the deployment's secrets/settings); it is None when unset.
openai.api_key = os.environ.get('OpenAPI')

def call_openai_api(prompt, *, model="gpt-3.5-turbo", max_retries=3,
                    retry_delay=1,
                    system_prompt="You are a helpful assistant."):
    """Send ``prompt`` to the OpenAI chat-completion API and return the reply.

    The request is retried when any exception is raised, pausing
    ``retry_delay`` seconds between attempts.

    Args:
        prompt: Text sent as the user message.
        model: Name of the chat model to query.
        max_retries: Total number of attempts (must be at least 1).
        retry_delay: Seconds to sleep between a failed attempt and the next.
        system_prompt: Text sent as the system message.

    Returns:
        The content of the first choice's message on success. If every
        attempt fails, the string form of the last exception is returned
        instead of raising, so callers always receive a string.

    Raises:
        ValueError: If ``max_retries`` is smaller than 1.
    """
    if max_retries < 1:
        raise ValueError("max_retries must be at least 1")

    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": prompt},
    ]

    for attempt in range(1, max_retries + 1):
        try:
            # NOTE(review): `openai.ChatCompletion` is the pre-1.0 client
            # interface -- confirm the pinned openai version provides it.
            response = openai.ChatCompletion.create(
                model=model,
                messages=messages,
            )
            return response['choices'][0]['message']['content']
        except Exception as e:
            # Deliberately broad: this is the boundary with the remote
            # service, and callers expect text back rather than an exception.
            if attempt == max_retries:
                return str(e)  # out of attempts: report the failure as text
            time.sleep(retry_delay)  # wait before the next attempt

def pdf_to_text(file, user_prompt):
    """Send the text of every PDF in an uploaded zip archive to the OpenAI API.

    Each archive member whose name ends in ``.pdf`` (case-insensitive) is
    read in memory and its page text is extracted and concatenated. Documents
    longer than 2000 NLTK word tokens are split into 2000-token chunks (tokens
    re-joined with single spaces) and each chunk is sent separately; shorter
    documents are sent as one request using the raw extracted text.

    Args:
        file: The uploaded archive. Either an object exposing the archive
            path as ``.name`` or the path itself.
        user_prompt: Optional instruction from the user. When it is non-empty
            it is placed ahead of the document text in every request; when
            empty or None the document text is sent on its own.

    Returns:
        All API responses joined with newlines, in archive/chunk order. An
        archive with no PDF members yields an empty string.
    """
    chunk_size = 2000
    archive_path = getattr(file, 'name', file)
    instruction = (user_prompt or '').strip()

    def build_prompt(body):
        # Put the user's instruction before the document text so it actually
        # influences the model; without one, send the text unchanged.
        if instruction:
            return instruction + "\n\n" + body
        return body

    texts = []
    # The context manager closes the archive even if a PDF fails to parse.
    with zipfile.ZipFile(archive_path, 'r') as z:
        for filename in z.namelist():
            if not filename.lower().endswith('.pdf'):
                continue

            pdf = PdfReader(io.BytesIO(z.read(filename)))
            # extract_text() may yield None for pages without a text layer;
            # treat those as empty rather than failing on concatenation.
            text = ''.join(page.extract_text() or '' for page in pdf.pages)

            tokens = nltk.word_tokenize(text)
            if len(tokens) > chunk_size:
                # Too long for a single request: send fixed-size token chunks.
                for start in range(0, len(tokens), chunk_size):
                    chunk_str = ' '.join(tokens[start:start + chunk_size])
                    texts.append(call_openai_api(build_prompt(chunk_str)))
            else:
                texts.append(call_openai_api(build_prompt(text)))
    return '\n'.join(texts)

# Build the web UI: a file upload (zip of PDFs) and a free-text prompt are
# passed to pdf_to_text, whose returned string is shown in a single textbox.
# The top-level gr.File / gr.Textbox components are used instead of the
# deprecated gr.inputs.* / gr.outputs.* namespaces, which newer Gradio
# releases no longer provide.
iface = gr.Interface(
    fn=pdf_to_text,
    inputs=[
        gr.File(label="PDF File (Upload a Zip file containing ONLY PDF files)"),
        gr.Textbox(label="User Prompt (Enter a prompt to guide the AI's responses)"),
    ],
    outputs=gr.Textbox(label="Cognitive Agent Response"),
    title="PDF Text Extractor",
    description="This app extracts knowledge from the uploaded Zip files. Using a Cognitive Agent you can interact with that knowledge.",
)
# Start the server without requesting a public share link.
iface.launch(share=False)