File size: 2,795 Bytes
5f07ce9
 
 
47125e3
0b87fda
5f07ce9
 
dc5e316
5f07ce9
5672db4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5f07ce9
 
 
dc5e316
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5f07ce9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
dc5e316
 
5f07ce9
 
dc5e316
 
5f07ce9
 
dc5e316
 
 
593d86e
 
dc5e316
593d86e
dc5e316
593d86e
dc5e316
fd8442d
dc5e316
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
import io
import logging
import os
import subprocess
import sys
import time
import zipfile

import gradio as gr
import nltk
import openai
import pip
from PyPDF2 import PdfReader

# Install the dependencies listed in requirements.txt using the interpreter
# that is running this script. check_call raises CalledProcessError when pip
# exits with a non-zero status, which aborts start-up.
# NOTE(review): this runs after the third-party imports at the top of the
# file, so it cannot rescue a missing gradio/PyPDF2/nltk/openai -- confirm
# whether the install step is still needed here.
requirements_install_cmd = [sys.executable, "-m", "pip", "install", "-r", "requirements.txt"]
subprocess.check_call(requirements_install_cmd)

# Fetch the NLTK data used by the tokenizer: 'punkt' first, then the 'all'
# collection (or any other packages the project depends on).
# NOTE(review): 'all' presumably already contains 'punkt' and is a large
# download on every start -- verify whether it can be narrowed.
for nltk_resource in ('punkt', 'all'):
    nltk.download(nltk_resource)








# The OpenAI API key is read from the 'OpenAPI' environment variable; the
# lookup yields None when that variable is not set.
openai.api_key = os.environ.get('OpenAPI')

def call_openai_api(prompt, *, model="gpt-3.5-turbo", max_retries=3, retry_delay=1.0):
    """Send *prompt* to the OpenAI chat-completion API and return the reply text.

    The request consists of a fixed system message followed by *prompt* as the
    user message. Any exception raised while making the request or reading the
    response counts as a failed attempt; failed attempts are logged and retried.

    Args:
        prompt: Text sent as the user message.
        model: Chat model name passed to ``openai.ChatCompletion.create``.
        max_retries: Total number of attempts (must be at least 1).
        retry_delay: Seconds to sleep between consecutive attempts.

    Returns:
        The content of the first choice's message on success. If every attempt
        fails, the string form of the last exception is returned instead of
        raising, so the caller can show it to the user.

    Raises:
        ValueError: If ``max_retries`` is smaller than 1.
    """
    if max_retries < 1:
        raise ValueError("max_retries must be at least 1")

    messages = [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": prompt},
    ]
    log = logging.getLogger(__name__)
    last_error = None

    for attempt in range(1, max_retries + 1):
        try:
            response = openai.ChatCompletion.create(model=model, messages=messages)
            return response['choices'][0]['message']['content']
        except Exception as exc:  # UI boundary: report the failure, never crash
            last_error = exc
            # Record every failure so intermediate errors are not lost.
            log.warning(
                "OpenAI request failed (attempt %d of %d): %s",
                attempt, max_retries, exc,
            )
            if attempt < max_retries:
                time.sleep(retry_delay)  # pause before the next attempt

    # All attempts failed: hand the last error message back as the result.
    return str(last_error)

def _extract_pdf_text(pdf_bytes):
    """Return the concatenated text of every page in the PDF held in *pdf_bytes*."""
    reader = PdfReader(io.BytesIO(pdf_bytes))
    # Defensive: treat a None/empty page result as "no text" instead of
    # failing on string concatenation.
    return ''.join(page.extract_text() or '' for page in reader.pages)


def _split_into_chunks(text, max_tokens=2000):
    """Split *text* into pieces of at most *max_tokens* NLTK word tokens.

    Text that fits within the limit is returned unchanged as a single piece;
    longer text is returned as space-joined runs of tokens.
    """
    tokens = nltk.word_tokenize(text)
    if len(tokens) <= max_tokens:
        return [text]
    return [
        ' '.join(tokens[start:start + max_tokens])
        for start in range(0, len(tokens), max_tokens)
    ]


def _build_prompt(user_prompt, document_text):
    """Prefix *document_text* with the user's instructions, if any were given."""
    instructions = user_prompt.strip() if user_prompt else ''
    if not instructions:
        return document_text
    return f"{instructions}\n\n{document_text}"


def pdf_to_text(file, user_prompt):
    """Run every PDF inside an uploaded zip archive through the OpenAI API.

    Each archive member whose name ends in ``.pdf`` (case-insensitive) is read,
    its page text is extracted, and the text is sent to ``call_openai_api`` --
    in pieces of at most 2000 tokens when it is long. ``user_prompt`` is placed
    in front of every piece so it actually guides the response; a blank prompt
    leaves the piece unchanged.

    Args:
        file: Uploaded file object; its ``name`` attribute is used as the path
            of the zip archive.
        user_prompt: Instructions entered by the user (may be empty).

    Returns:
        The API replies for all pieces, joined with newlines. An archive with
        no PDF members yields an empty string.
    """
    replies = []
    # The context manager closes the archive even if extraction fails midway.
    with zipfile.ZipFile(file.name, 'r') as archive:
        for member in archive.namelist():
            if not member.lower().endswith('.pdf'):
                continue
            text = _extract_pdf_text(archive.read(member))
            replies.extend(
                call_openai_api(_build_prompt(user_prompt, chunk))
                for chunk in _split_into_chunks(text)
            )
    return '\n'.join(replies)

# Input widgets: the zip upload and the free-text prompt, in the order
# pdf_to_text expects its arguments.
zip_upload = gr.inputs.File(label="PDF File (Upload a Zip file containing ONLY PDF files)")
prompt_box = gr.inputs.Textbox(label="User Prompt (Enter a prompt to guide the AI's responses)")

# Output widget showing the text returned by pdf_to_text.
response_box = gr.outputs.Textbox(label="Cognitive Agent Response")

# Wire the widgets to pdf_to_text and describe the app.
iface = gr.Interface(
    fn=pdf_to_text,
    inputs=[zip_upload, prompt_box],
    outputs=response_box,
    title="PDF Text Extractor",
    description="This app extracts knowledge from the uploaded Zip files. Using a Cognitive Agent you can interact with that knowledge.",
)

# Start the web app; share=False means no public share link is requested.
iface.launch(share=False)