pratikshahp committed
Commit c29df11 · verified · 1 Parent(s): 7d2ccd7

Create app.py

Files changed (1)
  app.py +93 -0
app.py ADDED
@@ -0,0 +1,93 @@
+ import gradio as gr
+ import fitz  # PyMuPDF
+ from transformers import AutoTokenizer, AutoModelForCausalLM
+ from langchain_community.vectorstores import Chroma
+ from langchain_community.embeddings import HuggingFaceEmbeddings
+ from langchain_text_splitters import RecursiveCharacterTextSplitter
+ from dotenv import load_dotenv
+
+ # Load environment variables (e.g. an HF token for gated models)
+ load_dotenv()
+
+ # Initialize the model and tokenizer
+ model_name = "openai-community/gpt2"
+ # model_name = "google/gemma-2-9b"  # gated; requires an auth token
+ tokenizer = AutoTokenizer.from_pretrained(model_name)
+ model = AutoModelForCausalLM.from_pretrained(model_name)
+
+ def get_llm_response(input_prompt, content, prompt):
+     combined_input = f"{input_prompt}\nContent: {content}\nQuestion: {prompt}\nAnswer:"
+     # Truncate so prompt plus new tokens stay inside GPT-2's 1024-token context
+     inputs = tokenizer(combined_input, return_tensors="pt", truncation=True, max_length=900)
+     outputs = model.generate(
+         **inputs,
+         max_new_tokens=100,
+         num_return_sequences=1,
+         pad_token_id=tokenizer.eos_token_id,
+     )
+     # Decode only the newly generated tokens, skipping the echoed prompt
+     generated = outputs[0][inputs["input_ids"].shape[1]:]
+     return tokenizer.decode(generated, skip_special_tokens=True).strip()
+
+ # Extract text from a PDF file on disk
+ def extract_text_from_pdf(file_path):
+     try:
+         doc = fitz.open(file_path)
+         text = ""
+         for page in doc:
+             text += page.get_text()
+         return text
+     except Exception as e:
+         return f"Error occurred while reading PDF file: {e}"
+
+ def process_pdf_and_answer_question(pdf_file, question):
+     # Extract text from the uploaded PDF file (Gradio passes a file path)
+     pdf_text = extract_text_from_pdf(pdf_file)
+
+     if not pdf_text or "Error occurred" in pdf_text:
+         return pdf_text or "The PDF appears to contain no extractable text."
+
+     try:
+         # Create embeddings
+         embeddings = HuggingFaceEmbeddings()
+
+         # Split text into chunks
+         text_splitter = RecursiveCharacterTextSplitter(
+             chunk_size=1000,
+             chunk_overlap=20,
+             length_function=len,
+             is_separator_regex=False,
+         )
+         chunks = text_splitter.create_documents([pdf_text])
+
+         # Build an in-memory Chroma store for this upload; reusing one
+         # persist_directory across uploads would mix chunks from earlier
+         # PDFs into later answers
+         vectordb = Chroma.from_documents(documents=chunks, embedding=embeddings)
+
+         # Answer the question from the most relevant chunk
+         if question:
+             docs = vectordb.similarity_search(question)
+             if not docs:
+                 return "No relevant passage found in the PDF."
+             text = docs[0].page_content
+             input_prompt = "You are an expert in understanding text contents. You will receive an input PDF file and you will have to answer questions based on the input file."
+             return get_llm_response(input_prompt, text, question)
+         else:
+             return "Please provide a valid question."
+     except Exception as e:
+         return f"Error occurred during text processing: {e}"
+
+ # Create Gradio interface
+ iface = gr.Interface(
+     fn=process_pdf_and_answer_question,
+     inputs=[
+         gr.File(type="filepath", label="Upload PDF File"),
+         gr.Textbox(lines=2, placeholder="Ask a Question"),
+     ],
+     outputs="text",
+     title="PDF Chatbot",
+     description="Upload a PDF file and ask questions about its content.",
+ )
+
+ if __name__ == "__main__":
+     iface.launch()
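
To sanity-check the retrieval step of this commit in isolation, here is a minimal sketch that exercises the same chunk → embed → similarity-search path outside of Gradio. The module names follow the imports above; the sample text, query string, and chunk sizes are arbitrary choices for illustration, not taken from the repo.

# retrieval_check.py - standalone sketch of the chunk -> embed -> search path
from langchain_community.vectorstores import Chroma
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter

sample_text = "Chroma is a vector store. It indexes embedded text chunks for similarity search."

splitter = RecursiveCharacterTextSplitter(chunk_size=50, chunk_overlap=10)
chunks = splitter.create_documents([sample_text])

# Same in-memory construction as in app.py
vectordb = Chroma.from_documents(documents=chunks, embedding=HuggingFaceEmbeddings())

# The top match should be the chunk mentioning similarity search
docs = vectordb.similarity_search("What does Chroma index?", k=1)
print(docs[0].page_content)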
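The Space will also need its dependencies declared. A plausible requirements.txt for this app follows; the package list is inferred from the imports (torch and sentence-transformers are pulled in by transformers and HuggingFaceEmbeddings respectively), and it is an assumption, not part of this commit. No version pins are suggested.

gradio
PyMuPDF
transformers
torch
langchain-community
langchain-text-splitters
chromadb
sentence-transformers
python-dotenv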