mgokg committed on
Commit
3b67edb
·
verified ·
1 Parent(s): d97712c

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +26 -57
app.py CHANGED
@@ -1,36 +1,34 @@
1
  import gradio as gr
 
2
  import fitz # PyMuPDF
3
- import torch
4
- from transformers import AutoTokenizer, AutoModelForCausalLM
5
 
6
- from langchain_community.vectorstores import Chroma
7
- from langchain_community.embeddings import HuggingFaceEmbeddings
8
- from langchain_text_splitters import RecursiveCharacterTextSplitter
9
- import os
 
 
 
 
 
 
 
10
  #from dotenv import load_dotenv
11
 
12
  # Load environment variables
13
  #load_dotenv()
14
  # hf_api_key = os.getenv("HF_TOKEN")
15
- model_name = "openai-community/gpt2"
16
  # model_name = "google/gemma-2-9b"
17
 
18
- tokenizer = AutoTokenizer.from_pretrained(model_name)
19
- model = AutoModelForCausalLM.from_pretrained(model_name) # ,use_auth_token=hf_api_key)
 
 
20
 
21
 
22
- def get_llm_response(input_prompt, content, prompt):
23
- combined_input = f"{input_prompt}\nContent: {content}\nQuestion: {prompt}\nAnswer:"
24
- inputs = tokenizer(combined_input, return_tensors="pt")
25
- outputs = model.generate(**inputs, max_length=1000, num_return_sequences=1)
26
- response = tokenizer.decode(outputs[0], skip_special_tokens=True)
27
-
28
- # Extract the answer part from the response
29
- answer_start = response.find("Answer:") + len("Answer:")
30
- answer = response[answer_start:].strip()
31
-
32
- return answer
33
-
34
 
35
  # Function to extract text from PDF file
36
  def extract_text_from_pdf(file_path):
@@ -48,45 +46,16 @@ def process_pdf(uploaded_file, prompt):
48
  if uploaded_file is not None:
49
  # Extract text from uploaded PDF file
50
  pdf_text = extract_text_from_pdf(uploaded_file.name)
 
51
  if pdf_text:
52
  try:
53
  # Create embeddings
54
- embeddings = HuggingFaceEmbeddings()
55
-
56
- # Split text into chunks
57
- text_splitter = RecursiveCharacterTextSplitter(
58
- chunk_size=1000,
59
- chunk_overlap=20,
60
- length_function=len,
61
- is_separator_regex=False,
62
- )
63
- chunks = text_splitter.create_documents([pdf_text])
64
-
65
- # Store chunks in ChromaDB
66
- persist_directory = 'pdf_embeddings'
67
- vectordb = Chroma.from_documents(documents=chunks, embedding=embeddings,
68
- persist_directory=persist_directory)
69
- vectordb.persist() # Persist ChromaDB
70
-
71
- # Load persisted Chroma database
72
- vectordb = Chroma(persist_directory=persist_directory, embedding_function=embeddings)
73
-
74
- # Perform question answering
75
- if prompt:
76
- docs = vectordb.similarity_search(prompt)
77
- if docs:
78
- text = docs[0].page_content
79
- input_prompt = "You are an expert in understanding text contents. You will receive an input PDF file and you will have to answer questions based on the input file."
80
- response = get_llm_response(input_prompt, text, prompt)
81
- return response
82
- else:
83
- return "No relevant documents found."
84
- else:
85
- return "Please enter a question."
86
- except Exception as e:
87
- return f"Error occurred during text processing: {e}"
88
- else:
89
- return "Please upload a PDF file."
90
 
91
 
92
  def main():
 
1
  import gradio as gr
2
+ import chromadb
3
  import fitz # PyMuPDF
4
+ #import torch
5
+ import time
6
 
7
+ # Aktuellen Timestamp erstellen
8
+
9
+
10
+
11
+
12
+ #from transformers import AutoTokenizer, AutoModelForCausalLM
13
+
14
+ #from langchain_community.vectorstores import Chroma
15
+ #from langchain_community.embeddings import HuggingFaceEmbeddings
16
+ #from langchain_text_splitters import RecursiveCharacterTextSplitter
17
+ #import os
18
  #from dotenv import load_dotenv
19
 
20
  # Load environment variables
21
  #load_dotenv()
22
  # hf_api_key = os.getenv("HF_TOKEN")
23
+ #model_name = "openai-community/gpt2"
24
  # model_name = "google/gemma-2-9b"
25
 
26
+ #tokenizer = AutoTokenizer.from_pretrained(model_name)
27
+ #model = AutoModelForCausalLM.from_pretrained(model_name) # ,use_auth_token=hf_api_key)
28
+ client = chromadb.PersistentClient(path="/pdf_embeddings")
29
+ collection = client.get_or_create_collection(name="code")
30
 
31
 
 
 
 
 
 
 
 
 
 
 
 
 
32
 
33
  # Function to extract text from PDF file
34
  def extract_text_from_pdf(file_path):
 
46
  if uploaded_file is not None:
47
  # Extract text from uploaded PDF file
48
  pdf_text = extract_text_from_pdf(uploaded_file.name)
49
+ timestamp = time.time()
50
  if pdf_text:
51
  try:
52
  # Create embeddings
53
+ collection.add(
54
+ documents=[pdf_text],
55
+ ids=[timestamp]
56
+ )
57
+
58
+
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
59
 
60
 
61
  def main():