File size: 2,074 Bytes
5f07ce9
 
 
47125e3
0b87fda
5f07ce9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
fd8442d
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
import gradio as gr
from PyPDF2 import PdfReader
import zipfile
import os
import io
import nltk
import openai

# The OpenAI API key is taken from the 'OpenAPI' environment variable
# (None when that variable is not set); it is not hard-coded in this file.
openai.api_key = os.environ.get('OpenAPI')

# Upper bound on the number of NLTK word tokens sent in a single request.
# NOTE: these are nltk.word_tokenize tokens, not the model's own tokens.
_MAX_TOKENS_PER_REQUEST = 2000


def _extract_pdf_text(pdf_bytes):
    """Return the text of every page of the PDF held in *pdf_bytes*, concatenated."""
    reader = PdfReader(io.BytesIO(pdf_bytes))
    # Defensive `or ''`: a page that yields no text must not break the join.
    return ''.join(page.extract_text() or '' for page in reader.pages)


def _split_for_model(text, max_tokens=_MAX_TOKENS_PER_REQUEST):
    """Return *text* as a list of pieces of at most *max_tokens* NLTK tokens.

    Text that already fits is returned unchanged as a single piece. Longer
    text is re-assembled from its tokens joined by single spaces, so the
    whitespace of those pieces differs from the source text.
    """
    tokens = nltk.word_tokenize(text)
    if len(tokens) <= max_tokens:
        return [text]
    return [
        ' '.join(tokens[start:start + max_tokens])
        for start in range(0, len(tokens), max_tokens)
    ]


def _ask_model(user_prompt, document_text):
    """Send the prompt and one piece of document text to the chat model; return its reply."""
    response = openai.ChatCompletion.create(
        model="gpt-3.5-turbo",
        messages=[
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": user_prompt},
            {"role": "user", "content": document_text},
        ],
    )
    return response['choices'][0]['message']['content']


def pdf_to_text(file, user_prompt):
    """Run *user_prompt* against every PDF inside a ZIP archive.

    Each archive entry whose name ends in ``.pdf`` (case-insensitive) is
    read, its page text extracted, split into pieces of at most
    ``_MAX_TOKENS_PER_REQUEST`` NLTK tokens, and each piece is sent to the
    chat model together with *user_prompt*.

    Args:
        file: The uploaded ZIP archive: either an object exposing the
            archive path as ``.name`` or the path itself. ``None`` (nothing
            uploaded) is accepted.
        user_prompt: Instruction sent to the model ahead of each piece of
            document text.

    Returns:
        The model replies joined by newlines, in archive order; an empty
        string when *file* is ``None`` or the archive contains no PDFs.

    Raises:
        zipfile.BadZipFile: If *file* is not a valid ZIP archive.
    """
    if file is None:
        return ''

    # Accept both a file-like wrapper carrying `.name` and a bare path.
    archive_path = getattr(file, 'name', file)

    texts = []
    # The context manager guarantees the archive is closed, even on errors.
    with zipfile.ZipFile(archive_path, 'r') as archive:
        for filename in archive.namelist():
            if not filename.lower().endswith('.pdf'):
                continue
            text = _extract_pdf_text(archive.read(filename))
            texts.extend(
                _ask_model(user_prompt, piece)
                for piece in _split_for_model(text)
            )
    return '\n'.join(texts)

# Wire pdf_to_text into a Gradio interface: a file input and a text input are
# passed to the function, and the string it returns is shown as text output.
iface = gr.Interface(
    fn=pdf_to_text,
    inputs=["file", "text"],
    outputs="text",
)
iface.launch(share=False)