pratikshahp committed
Commit d189514 · verified · 1 Parent(s): d46d62b

Update app.py

Files changed (1): app.py (+64 -53)
app.py CHANGED
@@ -5,30 +5,33 @@ from transformers import AutoTokenizer, AutoModelForCausalLM
 from langchain.vectorstores import Chroma
 from langchain_community.embeddings import HuggingFaceEmbeddings
 from langchain_text_splitters import RecursiveCharacterTextSplitter
-import os
+import os
 from dotenv import load_dotenv
 
 # Load environment variables
 load_dotenv()
+# hf_api_key = os.getenv("HF_TOKEN")
 
-# Initialize the model and tokenizer
 model_name = "openai-community/gpt2"
 # model_name = "google/gemma-2-9b"
+
 tokenizer = AutoTokenizer.from_pretrained(model_name)
-model = AutoModelForCausalLM.from_pretrained(model_name)  # , use_auth_token=hf_api_key
+model = AutoModelForCausalLM.from_pretrained(model_name)  # ,use_auth_token=hf_api_key)
+
 
 def get_llm_response(input_prompt, content, prompt):
     combined_input = f"{input_prompt}\nContent: {content}\nQuestion: {prompt}\nAnswer:"
     inputs = tokenizer(combined_input, return_tensors="pt")
     outputs = model.generate(**inputs, max_length=400, num_return_sequences=1)
     response = tokenizer.decode(outputs[0], skip_special_tokens=True)
-
+
     # Extract the answer part from the response
     answer_start = response.find("Answer:") + len("Answer:")
     answer = response[answer_start:].strip()
-
+
     return answer
 
+
 # Function to extract text from PDF file
 def extract_text_from_pdf(file):
     try:
@@ -40,54 +43,62 @@ def extract_text_from_pdf(file):
     except Exception as e:
         return f"Error occurred while reading PDF file: {e}"
 
-def process_pdf_and_answer_question(pdf_file, question):
-    # Extract text from uploaded PDF file
-    pdf_text = extract_text_from_pdf(pdf_file)
-
-    if not pdf_text or "Error occurred" in pdf_text:
-        return pdf_text
-
-    try:
-        # Create embeddings
-        embeddings = HuggingFaceEmbeddings()
-
-        # Split text into chunks
-        text_splitter = RecursiveCharacterTextSplitter(
-            chunk_size=1000,
-            chunk_overlap=20,
-            length_function=len,
-            is_separator_regex=False,
-        )
-        chunks = text_splitter.create_documents([pdf_text])
-
-        # Store chunks in ChromaDB
-        persist_directory = 'pdf_embeddings'
-        vectordb = Chroma.from_documents(documents=chunks, embedding=embeddings, persist_directory=persist_directory)
-        vectordb.persist()  # Persist ChromaDB
-
-        # Load persisted Chroma database
-        vectordb = Chroma(persist_directory=persist_directory, embedding_function=embeddings)
-
-        # Perform question answering
-        if question:
-            docs = vectordb.similarity_search(question)
-            text = docs[0].page_content
-            input_prompt = "You are an expert in understanding text contents. You will receive an input PDF file and you will have to answer questions based on the input file."
-            response = get_llm_response(input_prompt, text, question)
-            return response
-        else:
-            return "Please provide a valid question."
-    except Exception as e:
-        return f"Error occurred during text processing: {e}"
 
-# Create Gradio interface
-iface = gr.Interface(
-    fn=process_pdf_and_answer_question,
-    inputs=[gr.inputs.File(type="file", label="Upload PDF File"), gr.inputs.Textbox(lines=2, placeholder="Ask a Question")],
-    outputs="text",
-    title="PDF Chatbot",
-    description="Upload a PDF file and ask questions about its content."
-)
+def process_pdf(uploaded_file, prompt):
+    if uploaded_file is not None:
+        # Extract text from uploaded PDF file
+        pdf_text = extract_text_from_pdf(uploaded_file)
+        if pdf_text:
+            try:
+                # Create embeddings
+                embeddings = HuggingFaceEmbeddings()
+
+                # Split text into chunks
+                text_splitter = RecursiveCharacterTextSplitter(
+                    chunk_size=1000,
+                    chunk_overlap=20,
+                    length_function=len,
+                    is_separator_regex=False,
+                )
+                chunks = text_splitter.create_documents([pdf_text])
+
+                # Store chunks in ChromaDB
+                persist_directory = 'pdf_embeddings'
+                vectordb = Chroma.from_documents(documents=chunks, embedding=embeddings,
+                                                 persist_directory=persist_directory)
+                vectordb.persist()  # Persist ChromaDB
+
+                # Load persisted Chroma database
+                vectordb = Chroma(persist_directory=persist_directory, embedding_function=embeddings)
+
+                # Perform question answering
+                if prompt:
+                    docs = vectordb.similarity_search(prompt)
+                    if docs:
+                        text = docs[0].page_content
+                        input_prompt = "You are an expert in understanding text contents. You will receive an input PDF file and you will have to answer questions based on the input file."
+                        response = get_llm_response(input_prompt, text, prompt)
+                        return response
+                    else:
+                        return "No relevant documents found."
+                else:
+                    return "Please enter a question."
+            except Exception as e:
+                return f"Error occurred during text processing: {e}"
+    else:
+        return "Please upload a PDF file."
+
+
+def main():
+    gr.Interface(
+        fn=process_pdf,
+        inputs=[gr.components.File(type="file", label="Upload PDF File"),
+                gr.components.Textbox(lines=2, placeholder="Ask a Question")],
+        outputs="text",
+        title="PDF Chatbot",
+        description="Upload a PDF file and ask questions about its content."
+    ).launch()
+
 
 if __name__ == "__main__":
-    iface.launch()
+    main()
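
One caveat in get_llm_response(): generate's max_length=400 counts the prompt tokens as well as the completion, so a retrieved chunk near the 1000-character chunk_size can leave little or no room for the answer (and GPT-2's context window is only 1024 tokens). A minimal sketch of a safer variant of those two lines, assuming the same tokenizer and model objects as in app.py; max_new_tokens bounds only the generated continuation:

    # Sketch: drop-in replacement for the tokenizer/generate lines in get_llm_response().
    # Truncate the prompt so prompt + completion fit GPT-2's 1024-token window,
    # then bound only the newly generated tokens.
    inputs = tokenizer(combined_input, return_tensors="pt", truncation=True, max_length=896)
    outputs = model.generate(
        **inputs,
        max_new_tokens=128,
        pad_token_id=tokenizer.eos_token_id,  # GPT-2 has no pad token; avoids a warning
    )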
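
The retrieval half of process_pdf() can also be exercised on its own, which helps when debugging outside the UI. A minimal sketch, assuming a pdf_embeddings directory populated by a previous run (the query string is only an example):

    # Sketch: query the persisted Chroma store directly, no Gradio involved.
    from langchain.vectorstores import Chroma
    from langchain_community.embeddings import HuggingFaceEmbeddings

    embeddings = HuggingFaceEmbeddings()  # default sentence-transformers model
    vectordb = Chroma(persist_directory="pdf_embeddings", embedding_function=embeddings)
    docs = vectordb.similarity_search("What is this document about?", k=1)
    print(docs[0].page_content if docs else "No relevant documents found.")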
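
A note on the interface: gr.components.File(type="file") follows the Gradio 3.x API; in Gradio 4 the "file" value was removed (type must be "filepath" or "binary") and the components are usually written as gr.File / gr.Textbox. A sketch of the same main() against Gradio 4, under the assumption that process_pdf() accepts a file path, which is what type="filepath" passes to fn:

    # Sketch: Gradio 4.x form of main() (assumes process_pdf takes a file path).
    import gradio as gr

    def main():
        gr.Interface(
            fn=process_pdf,
            inputs=[
                gr.File(type="filepath", label="Upload PDF File"),
                gr.Textbox(lines=2, placeholder="Ask a Question"),
            ],
            outputs="text",
            title="PDF Chatbot",
            description="Upload a PDF file and ask questions about its content.",
        ).launch()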