RAG-Vereine

Sleeping

App Files Files Community

mgokg committed on Nov 10, 2024

Commit

3b67edb

verified ·

1 Parent(s): d97712c

Update app.py

Browse files

Files changed (1) hide show

app.py +26 -57

app.py CHANGED Viewed

@@ -1,36 +1,34 @@
 import gradio as gr
 import fitz  # PyMuPDF
-import torch
-from transformers import AutoTokenizer, AutoModelForCausalLM
-from langchain_community.vectorstores import Chroma
-from langchain_community.embeddings import HuggingFaceEmbeddings
-from langchain_text_splitters import RecursiveCharacterTextSplitter
-import os
 #from dotenv import load_dotenv
 # Load environment variables
 #load_dotenv()
 # hf_api_key = os.getenv("HF_TOKEN")
-model_name = "openai-community/gpt2"
 # model_name = "google/gemma-2-9b"
-tokenizer = AutoTokenizer.from_pretrained(model_name)
-model = AutoModelForCausalLM.from_pretrained(model_name)  # ,use_auth_token=hf_api_key)
-def get_llm_response(input_prompt, content, prompt):
-    combined_input = f"{input_prompt}\nContent: {content}\nQuestion: {prompt}\nAnswer:"
-    inputs = tokenizer(combined_input, return_tensors="pt")
-    outputs = model.generate(**inputs, max_length=1000, num_return_sequences=1)
-    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
-    # Extract the answer part from the response
-    answer_start = response.find("Answer:") + len("Answer:")
-    answer = response[answer_start:].strip()
-    return answer
 # Function to extract text from PDF file
 def extract_text_from_pdf(file_path):
@@ -48,45 +46,16 @@ def process_pdf(uploaded_file, prompt):
     if uploaded_file is not None:
         # Extract text from uploaded PDF file
         pdf_text = extract_text_from_pdf(uploaded_file.name)
         if pdf_text:
             try:
                 # Create embeddings
-                embeddings = HuggingFaceEmbeddings()
-                # Split text into chunks
-                text_splitter = RecursiveCharacterTextSplitter(
-                    chunk_size=1000,
-                    chunk_overlap=20,
-                    length_function=len,
-                    is_separator_regex=False,
-                )
-                chunks = text_splitter.create_documents([pdf_text])
-                # Store chunks in ChromaDB
-                persist_directory = 'pdf_embeddings'
-                vectordb = Chroma.from_documents(documents=chunks, embedding=embeddings,
-                                                 persist_directory=persist_directory)
-                vectordb.persist()  # Persist ChromaDB
-                # Load persisted Chroma database
-                vectordb = Chroma(persist_directory=persist_directory, embedding_function=embeddings)
-                # Perform question answering
-                if prompt:
-                    docs = vectordb.similarity_search(prompt)
-                    if docs:
-                        text = docs[0].page_content
-                        input_prompt = "You are an expert in understanding text contents. You will receive an input PDF file and you will have to answer questions based on the input file."
-                        response = get_llm_response(input_prompt, text, prompt)
-                        return response
-                    else:
-                        return "No relevant documents found."
-                else:
-                    return "Please enter a question."
-            except Exception as e:
-                return f"Error occurred during text processing: {e}"
-    else:
-        return "Please upload a PDF file."
 def main():

 import gradio as gr
+import chromadb
 import fitz  # PyMuPDF
+#import torch
+import time
+# Aktuellen Timestamp erstellen
+#from transformers import AutoTokenizer, AutoModelForCausalLM
+#from langchain_community.vectorstores import Chroma
+#from langchain_community.embeddings import HuggingFaceEmbeddings
+#from langchain_text_splitters import RecursiveCharacterTextSplitter
+#import os
 #from dotenv import load_dotenv
 # Load environment variables
 #load_dotenv()
 # hf_api_key = os.getenv("HF_TOKEN")
+#model_name = "openai-community/gpt2"
 # model_name = "google/gemma-2-9b"
+#tokenizer = AutoTokenizer.from_pretrained(model_name)
+#model = AutoModelForCausalLM.from_pretrained(model_name)  # ,use_auth_token=hf_api_key)
+client = chromadb.PersistentClient(path="/pdf_embeddings")
+collection = client.get_or_create_collection(name="code")
 # Function to extract text from PDF file
 def extract_text_from_pdf(file_path):
     if uploaded_file is not None:
         # Extract text from uploaded PDF file
         pdf_text = extract_text_from_pdf(uploaded_file.name)
+        timestamp = time.time()
         if pdf_text:
             try:
                 # Create embeddings
+                collection.add(
+                    documents=[pdf_text],
+                    ids=[timestamp]
+)
 def main():