import streamlit as st
import faiss
import numpy as np
import torch
from PyPDF2 import PdfReader
from transformers import pipeline, AutoTokenizer, AutoModel


# Load the Hugging Face model for text generation
@st.cache_resource
def load_text_generator():
    return pipeline("text2text-generation", model="google/flan-t5-base")


# Load the Hugging Face model for embeddings
@st.cache_resource
def load_embedding_model():
    tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")
    model = AutoModel.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")
    return tokenizer, model


text_generator = load_text_generator()
embedding_tokenizer, embedding_model = load_embedding_model()


# Function to extract text from every page of a PDF
def extract_pdf_content(pdf_file):
    reader = PdfReader(pdf_file)
    content = ""
    for page in reader.pages:
        # extract_text() can return None for image-only pages
        content += page.extract_text() or ""
    return content


# Function to split content into fixed-size word chunks
def chunk_text(text, chunk_size=500):
    words = text.split()
    return [" ".join(words[i:i + chunk_size]) for i in range(0, len(words), chunk_size)]


# Function to compute one embedding per chunk
def compute_embeddings(text_chunks):
    embeddings = []
    for chunk in text_chunks:
        inputs = embedding_tokenizer(chunk, return_tensors="pt", truncation=True, padding=True)
        with torch.no_grad():
            outputs = embedding_model(**inputs)
        # pooler_output is BERT's [CLS]-based pooling; sentence-transformers models
        # are more commonly used with mean pooling over last_hidden_state
        embeddings.append(outputs.pooler_output.numpy()[0])
    # FAISS expects float32 arrays
    return np.array(embeddings, dtype="float32")


# Function to build a FAISS index over the chunk embeddings
def build_faiss_index(embeddings):
    dimension = embeddings.shape[1]
    index = faiss.IndexFlatL2(dimension)  # exact search with L2 distance
    index.add(embeddings)
    return index


# Function to retrieve the top-k chunks closest to the query embedding
def search_faiss_index(index, query_embedding, text_chunks, top_k=3):
    distances, indices = index.search(query_embedding, top_k)
    return [(text_chunks[idx], distances[0][i]) for i, idx in enumerate(indices[0])]


# Function to generate structured content
def generate_professional_content(topic):
    prompt = f"Explain '{topic}' in bullet points, highlighting key concepts, examples, and applications."
    response = text_generator(prompt, max_length=300, num_return_sequences=1)
    return response[0]["generated_text"]


# Function to compute a query embedding (shape (1, dim), as FAISS expects)
def compute_query_embedding(query):
    inputs = embedding_tokenizer(query, return_tensors="pt", truncation=True, padding=True)
    with torch.no_grad():
        outputs = embedding_model(**inputs)
    return outputs.pooler_output.numpy().astype("float32")


# Streamlit app
st.title("Generative AI for Electrical Engineering Education with FAISS")
st.sidebar.header("AI-Based Tutor with Vector Search")

# File upload section
uploaded_file = st.sidebar.file_uploader("Upload Study Material (PDF)", type=["pdf"])
topic = st.sidebar.text_input("Enter a topic (e.g., Newton's Third Law)")

if uploaded_file:
    # Extract and process file content
    content = extract_pdf_content(uploaded_file)
    st.sidebar.success(f"{uploaded_file.name} uploaded successfully!")

    # Chunk and compute embeddings
    chunks = chunk_text(content)
    embeddings = compute_embeddings(chunks)

    # Build FAISS index
    index = build_faiss_index(embeddings)

    st.write("**File Processed and Indexed for Search**")
    st.write(f"Total chunks created: {len(chunks)}")

# Generate study material
if st.button("Generate Study Material"):
    if topic:
        st.header(f"Study Material: {topic}")

        # Compute query embedding
        query_embedding = compute_query_embedding(topic)

        # Search FAISS index (lower L2 distance = more similar)
        if uploaded_file:
            results = search_faiss_index(index, query_embedding, chunks, top_k=3)
            st.write("**Relevant Content from Uploaded File:**")
            for result, distance in results:
                st.write(f"- {result} (L2 distance: {distance:.2f})")
        else:
            st.warning("No file uploaded. Generating AI-based content instead.")

        # Generate AI content
        ai_content = generate_professional_content(topic)
        st.write("**AI-Generated Content:**")
        st.write(ai_content)
    else:
        st.warning("Please enter a topic!")
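
# Usage sketch (assumption: the script is saved as app.py; the filename is illustrative):
#   pip install streamlit PyPDF2 transformers torch faiss-cpu
#   streamlit run app.py
#
# Design note: IndexFlatL2 ranks chunks by Euclidean distance, so smaller scores
# mean closer matches. A common alternative (not what this script does) is cosine
# similarity: L2-normalize the embeddings in place with faiss.normalize_L2(embeddings)
# and build a faiss.IndexFlatIP inner-product index instead of IndexFlatL2.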