Cipher29 committed on
Commit
7894e62
·
verified ·
1 Parent(s): d23e82f

Upload 3 files

Browse files
TheyKnow/Gumball.docx ADDED
Binary file (20.7 kB). View file
 
TheyKnow/RAG.py ADDED
@@ -0,0 +1,67 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
from pathlib import Path

import faiss
import numpy as np
import streamlit as st
from docx import Document
from sentence_transformers import SentenceTransformer
from transformers import pipeline
7
+
8
# Sentence-transformer model used to embed both the document chunks and
# the user's question.
# NOTE(review): Streamlit re-executes this script on every interaction, so
# both models below are re-initialized on each rerun; consider Streamlit's
# resource caching if the installed version provides it.
embedder = SentenceTransformer('all-MiniLM-L6-v2')

# Text generation model (Flan-T5) that produces the final answer.
pipe = pipeline("text2text-generation", model="google/flan-t5-large")

# Path to the pre-defined .docx document. It is resolved relative to this
# script (Gumball.docx is committed in the same directory) rather than an
# absolute path into one developer's home directory, so the app can run on
# any machine or deployment.
file_path = str(Path(__file__).resolve().parent / "Gumball.docx")
16
+
17
+ # Load and process the .docx document
18
def load_docx(file_path):
    """Load text from a .docx file and return a list of paragraphs.

    Args:
        file_path: Location of the .docx file; passed unchanged to
            ``Document``.

    Returns:
        A list holding the stripped text of every paragraph that contains
        something other than whitespace, in document order.
    """
    doc = Document(file_path)
    # A paragraph made only of spaces/tabs is still a truthy string, so the
    # filter must look at the stripped text; otherwise blank-looking lines
    # get embedded and can later be returned as "relevant" context.
    stripped_paragraphs = (para.text.strip() for para in doc.paragraphs)
    return [text for text in stripped_paragraphs if text]
23
+
24
# Create FAISS index for similarity search
index = None  # Global index for the uploaded document
gumball_document = []  # To store the document content


def create_faiss_index(text_chunks):
    """Create a FAISS index from text chunks and store it in ``index``.

    Args:
        text_chunks: Sequence of strings to embed; row ``i`` of the index
            corresponds to ``text_chunks[i]``.

    Raises:
        ValueError: If ``text_chunks`` is empty, since there is nothing to
            embed and no embedding dimension to build the index from.
    """
    global index
    if len(text_chunks) == 0:
        raise ValueError("Cannot build a FAISS index from an empty document.")

    # Embed the chunks. FAISS operates on float32 matrices, so coerce the
    # dtype explicitly instead of relying on the encoder's default.
    document_embeddings = np.asarray(
        embedder.encode(text_chunks), dtype=np.float32
    )

    # Build the index locally and publish it only once it is fully
    # populated, so a failure part-way never leaves a half-built global.
    dimension = document_embeddings.shape[1]
    new_index = faiss.IndexFlatL2(dimension)
    new_index.add(document_embeddings)
    index = new_index
38
+
39
def retrieve_relevant_text(question, top_k=3):
    """Retrieve the most relevant text chunks based on the question.

    Args:
        question: The user's question as a string.
        top_k: Maximum number of chunks to return.

    Returns:
        Up to ``top_k`` entries of ``gumball_document``, ordered as returned
        by the index search. Fewer entries are returned when the document
        has fewer chunks than ``top_k``.

    Raises:
        RuntimeError: If the FAISS index has not been created yet.
    """
    if index is None:
        raise RuntimeError(
            "FAISS index has not been built; call create_faiss_index() first."
        )

    # Never ask for more neighbours than exist: FAISS pads missing results
    # with the label -1, and gumball_document[-1] would silently return the
    # last paragraph as if it were a real match.
    available = len(gumball_document)
    k = min(int(top_k), index.ntotal, available)
    if k <= 0:
        return []

    question_embedding = np.asarray(
        embedder.encode([question]), dtype=np.float32
    )
    _, indices = index.search(question_embedding, k)
    # Defensive filter in case the index and the document list ever get out
    # of sync or the search still yields padding labels.
    return [gumball_document[idx] for idx in indices[0] if 0 <= idx < available]
44
+
45
# Streamlit App
st.title("RAG with The Amazing World of Gumball")

# Automatically load and process the document from the specified path.
# gumball_document is the module-level list that retrieve_relevant_text()
# indexes into, so it must be populated before any question is handled.
gumball_document = load_docx(file_path)
create_faiss_index(gumball_document)  # Create FAISS index

# Display input for question
question = st.text_input("Ask a question:")

# Slider to set the number of relevant passages to retrieve
top_k = st.slider("Number of relevant passages to retrieve:", 1, 5, 3)

# Ignore empty and whitespace-only input so no generation is triggered for
# a question that contains no text.
if question and question.strip():
    # Retrieve relevant text from the document
    relevant_texts = retrieve_relevant_text(question, top_k=top_k)
    context = " ".join(relevant_texts)

    # Generate answer using Flan-T5 model
    prompt = f"question: {question} context: {context}"
    generated_answer = pipe(prompt, max_length=100)[0]["generated_text"]

    # Display the answer and the context it was generated from, so the user
    # can check which passages the answer is based on.
    st.write("Answer:", generated_answer)
    st.write("Context used:", context)
TheyKnow/~$umball.docx ADDED
Binary file (162 Bytes). View file