Chris4K committed · verified
Commit e8faa1f · 1 Parent(s): e96852d

Update app.py

Files changed (1)
  1. app.py +91 -110
app.py CHANGED
@@ -1,117 +1,98 @@
  import os
  import gradio as gr
- from langchain.vectorstores.faiss import FAISS
- from langchain.embeddings import HuggingFaceBgeEmbeddings
- from langchain.document_loaders import PyPDFLoader
- from langchain.text_splitter import CharacterTextSplitter
- from PyPDF2 import PdfReader
-
- # Load environment variables
- #load_dotenv()
-
-
- # Print the current working directory
- current_directory = os.getcwd()
- print("Current Working Directory:", current_directory)
-
- def get_pdf_text(pdf_docs):
-     """
-     Extract text from a list of PDF documents.
-
-     Parameters
-     ----------
-     pdf_docs : list
-         List of PDF documents to extract text from.
-
-     Returns
-     -------
-     str
-         Extracted text from all the PDF documents.
-
-     """
-     text = ""
-     #for pdf in pdf_docs:
-     pdf_reader = PdfReader(pdf_docs)
-     for page in pdf_reader.pages:
-         text += page.extract_text()
-     return text
-
-
- def get_text_chunks(text):
-     """
-     Split the input text into chunks.
-
-     Parameters
-     ----------
-     text : str
-         The input text to be split.
-
-     Returns
-     -------
-     list
-         List of text chunks.
-
-     """
-     text_splitter = CharacterTextSplitter(
-         separator="\n", chunk_size=1500, chunk_overlap=300, length_function=len
-     )
-     chunks = text_splitter.split_text(text)
-     return chunks
-
-
- def get_vectorstore(text_chunks):
-     """
-     Generate a vector store from a list of text chunks using HuggingFace BgeEmbeddings.
-
-     Parameters
-     ----------
-     text_chunks : list
-         List of text chunks to be embedded.
-
-     Returns
-     -------
-     FAISS
-         A FAISS vector store containing the embeddings of the text chunks.
-
-     """
-     model = "BAAI/bge-base-en-v1.5"
-     encode_kwargs = {
-         "normalize_embeddings": True
-     }  # set True to compute cosine similarity
-     embeddings = HuggingFaceBgeEmbeddings(
-         model_name=model, encode_kwargs=encode_kwargs, model_kwargs={"device": "cpu"}
-     )
-     vectorstore = FAISS.from_texts(texts=text_chunks, embedding=embeddings)
-     print("-----")
-     print(vectorstore.similarity_search("What is ALiBi?"))
-     print("-----")
-     return vectorstore
-
- # Adjust the path to your PDF file by escaping the space
- pdf_path = r"new_papers/ALiBi.pdf"
- pdf_text = get_pdf_text(pdf_path)
-
- text_chunks = get_text_chunks(pdf_text)
- api_db = get_vectorstore(text_chunks)
-
-
- # Define the PDF retrieval function
- def pdf_retrieval(query):
-     # Run the query through the retriever
-     response = api_db.similarity_search(query)
-     print(response)
-     return response
-
- # Create Gradio interface for the API retriever
- api_tool = gr.Interface(
-     fn=pdf_retrieval,
-     inputs=[gr.Textbox()],
-     outputs=gr.Textbox(),
-     live=True,
-     title="API PDF Retrieval Tool",
-     description="This tool indexes PDF documents and retrieves relevant answers based on a given query (HuggingFaceBgeEmbeddings).",
- )
-
- # Launch the Gradio interface
- api_tool.launch()
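
The removed version wired the whole pipeline together inline: extract text with PyPDF2, split it with CharacterTextSplitter, embed the chunks with BAAI/bge-base-en-v1.5, and index them in FAISS. Note that its import paths were deprecated in later LangChain releases; on current versions the equivalents live in the langchain-community package, roughly as follows (a sketch, assuming langchain-community is installed; verify against your version):

    # Assumed modern equivalents of the deprecated imports above.
    from langchain_community.vectorstores import FAISS
    from langchain_community.embeddings import HuggingFaceBgeEmbeddings
    from langchain_community.document_loaders import PyPDFLoader
    from langchain.text_splitter import CharacterTextSplitter

The replacement version, below, moves indexing and retrieval behind a RAGTool class and adds a configurable upload-and-query UI:
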
  import os
  import gradio as gr
+ from rag_tool import RAGTool
+
+ # Initialize the RAG Tool with default settings
+ rag_tool = RAGTool(
+     documents_path="./documents",
+     embedding_model="sentence-transformers/all-MiniLM-L6-v2",
+     vector_store_type="faiss",
+     chunk_size=1000,
+     chunk_overlap=200,
+     persist_directory="./vector_store"
+ )
+
+ # Function to handle document uploads
+ def upload_documents(files, chunk_size, chunk_overlap, embedding_model, vector_store_type):
+     # Create a temporary directory for uploaded files
+     os.makedirs("./uploaded_docs", exist_ok=True)
+
+     # Save uploaded files
+     for file in files:
+         file_path = os.path.join("./uploaded_docs", os.path.basename(file.name))
+         with open(file_path, "wb") as f:
+             f.write(file.read())
+
+     # Initialize a new RAG Tool with the uploaded documents
+     global rag_tool
+     rag_tool = RAGTool(
+         documents_path="./uploaded_docs",
+         embedding_model=embedding_model,
+         vector_store_type=vector_store_type,
+         chunk_size=int(chunk_size),
+         chunk_overlap=int(chunk_overlap),
+         persist_directory="./uploaded_vector_store"
+     )
+
+     return f"Documents uploaded and processed. Vector store created with {embedding_model} model."
+
+ # Function to handle queries
+ def query_documents(query, top_k):
+     global rag_tool
+     return rag_tool(query, top_k=int(top_k))
+
+ # Gradio interface
+ with gr.Blocks(title="Advanced RAG Tool") as demo:
+     gr.Markdown("# Advanced RAG Tool")
+     gr.Markdown("Upload documents and query them using semantic search")
+
+     with gr.Tab("Upload & Configure"):
+         with gr.Row():
+             with gr.Column():
+                 files = gr.File(file_count="multiple", label="Upload Documents")
+                 chunk_size = gr.Slider(200, 2000, value=1000, step=100, label="Chunk Size")
+                 chunk_overlap = gr.Slider(0, 500, value=200, step=50, label="Chunk Overlap")
+
+             with gr.Column():
+                 embedding_models = [
+                     "sentence-transformers/all-MiniLM-L6-v2",
+                     "BAAI/bge-small-en-v1.5",
+                     "BAAI/bge-base-en-v1.5",
+                     "thenlper/gte-small",
+                     "thenlper/gte-base"
+                 ]
+                 embedding_model = gr.Dropdown(
+                     choices=embedding_models,
+                     value="sentence-transformers/all-MiniLM-L6-v2",
+                     label="Embedding Model"
+                 )
+                 vector_store_type = gr.Radio(
+                     choices=["faiss", "chroma"],
+                     value="faiss",
+                     label="Vector Store Type"
+                 )
+
+         upload_button = gr.Button("Upload and Process Documents")
+         upload_result = gr.Textbox(label="Upload Result")
+
+         upload_button.click(
+             upload_documents,
+             inputs=[files, chunk_size, chunk_overlap, embedding_model, vector_store_type],
+             outputs=upload_result
+         )
+
+     with gr.Tab("Query Documents"):
+         query = gr.Textbox(label="Your Question", placeholder="What information are you looking for?")
+         top_k = gr.Slider(1, 10, value=3, step=1, label="Number of Results")
+         query_button = gr.Button("Search")
+         answer = gr.Textbox(label="Results")
+
+         query_button.click(
+             query_documents,
+             inputs=[query, top_k],
+             outputs=answer
+         )
+
+ # Launch the app
+ if __name__ == "__main__":
+     demo.launch()
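
The new app.py delegates all indexing and retrieval to RAGTool, but rag_tool.py itself is not part of this commit, so its interface can only be inferred from the calls above: a constructor taking documents_path, embedding_model, vector_store_type, chunk_size, chunk_overlap, and persist_directory, plus a __call__(query, top_k) that returns displayable text. A minimal sketch that would satisfy those calls (hypothetical, not the actual module) could look like:

    # rag_tool.py -- hypothetical sketch of the interface app.py expects;
    # the real module is not included in this commit.
    import os

    from langchain_community.embeddings import HuggingFaceEmbeddings
    from langchain_community.vectorstores import FAISS
    from langchain_community.document_loaders import PyPDFLoader, TextLoader
    from langchain.text_splitter import CharacterTextSplitter

    class RAGTool:
        def __init__(self, documents_path, embedding_model,
                     vector_store_type="faiss", chunk_size=1000,
                     chunk_overlap=200, persist_directory=None):
            embeddings = HuggingFaceEmbeddings(model_name=embedding_model)

            # Load every file under documents_path (PDF and plain text only here).
            docs = []
            for name in os.listdir(documents_path):
                path = os.path.join(documents_path, name)
                loader = PyPDFLoader(path) if name.lower().endswith(".pdf") else TextLoader(path)
                docs.extend(loader.load())

            # Split into overlapping chunks, matching the UI sliders.
            splitter = CharacterTextSplitter(
                separator="\n", chunk_size=chunk_size, chunk_overlap=chunk_overlap
            )
            chunks = splitter.split_documents(docs)

            # Only the "faiss" branch is sketched; "chroma" would swap in
            # langchain_community.vectorstores.Chroma instead.
            self.store = FAISS.from_documents(chunks, embeddings)
            if persist_directory:
                self.store.save_local(persist_directory)

        def __call__(self, query, top_k=3):
            # Return the top_k most similar chunks, concatenated for display.
            results = self.store.similarity_search(query, k=top_k)
            return "\n\n".join(doc.page_content for doc in results)

With that shape, rag_tool(query, top_k=3) returns the concatenated top matches, which query_documents passes straight to the results textbox.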