import os

import streamlit as st
from PyPDF2 import PdfReader
from sentence_transformers import SentenceTransformer
import faiss
import matplotlib.pyplot as plt
import numpy as np
from groq import Groq

# SECURITY: never hard-code API keys in source. A real key was previously
# committed here -- it must be treated as compromised and rotated.
# Read the key from the environment instead.
GROQ_API_KEY = os.environ.get("GROQ_API_KEY", "")
client = Groq(api_key=GROQ_API_KEY)

# Embedding model shared by document chunks and user queries.
embedding_model = SentenceTransformer('all-MiniLM-L6-v2')

# FAISS index over L2 distance; 384 is the output dimensionality
# of 'all-MiniLM-L6-v2'.
embedding_dim = 384
faiss_index = faiss.IndexFlatL2(embedding_dim)

# Parallel store: metadata_store[i] describes the i-th vector in faiss_index.
metadata_store = []


def identify_research_gaps(chunks):
    """Return the chunks that contain research-gap language.

    Scans each chunk (case-insensitively) for keywords that typically
    signal unanswered questions, limitations, or future-work sections.

    Parameters
    ----------
    chunks : list[str]
        Text chunks extracted from the uploaded papers.

    Returns
    -------
    str
        Matching chunks joined by newlines, or a fallback message when no
        chunk contains any keyword.
    """
    keywords = [
        "future research", "unanswered questions", "limitations",
        "need for more research", "open questions", "further investigation",
        "research gap", "areas for improvement", "unknowns", "unexplored",
        "unresolved issues",
    ]
    matches = [
        chunk for chunk in chunks
        if any(keyword in chunk.lower() for keyword in keywords)
    ]
    return "\n".join(matches) if matches else "No specific unanswered questions found."
def extract_text_from_pdf(pdf_file):
    """Extract plain text from every page of an uploaded PDF file."""
    pdf_reader = PdfReader(pdf_file)
    text = ""
    for page in pdf_reader.pages:
        # extract_text() may return None for image-only pages; guard the +=.
        text += page.extract_text() or ""
    return text


def chunk_text(text, chunk_size=500):
    """Split text into chunks of at most chunk_size whitespace-separated words."""
    words = text.split()
    return [' '.join(words[i:i + chunk_size]) for i in range(0, len(words), chunk_size)]


def generate_embeddings(chunks):
    """Encode text chunks into dense vectors with the shared embedding model."""
    return embedding_model.encode(chunks)


def store_embeddings(embeddings, metadata):
    """Add embeddings to the FAISS index, keeping metadata_store index-aligned."""
    faiss_index.add(np.array(embeddings))
    metadata_store.extend(metadata)


def retrieve_relevant_chunks(query, k=5):
    """Return up to k (metadata, distance) pairs nearest to the query.

    FAISS pads missing results with id -1 when the index holds fewer than k
    vectors; the original `i < len(...)` filter let -1 through and silently
    returned the *last* metadata entry, so we require 0 <= i as well.
    """
    query_embedding = embedding_model.encode([query])
    distances, indices = faiss_index.search(query_embedding, k)
    valid_results = [
        (metadata_store[i], distances[0][j])
        for j, i in enumerate(indices[0])
        if 0 <= i < len(metadata_store)
    ]
    return valid_results


def ask_groq_api(question, context):
    """Ask the Groq LLM a question grounded in the retrieved context."""
    chat_completion = client.chat.completions.create(
        messages=[{"role": "user", "content": f"{context}\n\n{question}"}],
        model="llama3-8b-8192"
    )
    return chat_completion.choices[0].message.content


def analyze_research_gaps(chunks):
    """Flag potential inconsistencies by a pairwise chunk comparison.

    Heuristic only: two chunks whose first 100 characters differ are reported
    as a potential inconsistency.

    NOTE: this must be defined *before* the UI code below -- Streamlit runs
    the script top to bottom, and the original file defined it after its call
    site, which raised NameError at run time.
    """
    gaps = []
    for i, chunk_1 in enumerate(chunks):
        for j, chunk_2 in enumerate(chunks):
            if i != j and chunk_1[:100] != chunk_2[:100]:
                gaps.append(f"Potential inconsistency between chunk {i} and chunk {j}.")
    return "\n".join(gaps) if gaps else "No major inconsistencies found."


# ---------------- Streamlit UI ----------------
st.title("RAG-Based Research Paper Analyzer")

uploaded_files = st.file_uploader("Upload PDF Files", accept_multiple_files=True, type="pdf")

if uploaded_files:
    all_chunks = []
    all_metadata = []
    for uploaded_file in uploaded_files:
        text = extract_text_from_pdf(uploaded_file)
        chunks = chunk_text(text)
        embeddings = generate_embeddings(chunks)
        metadata = [{"chunk": chunk, "file_name": uploaded_file.name} for chunk in chunks]
        store_embeddings(embeddings, metadata)
        all_chunks.extend(chunks)
        all_metadata.extend(metadata)

    st.success("Files uploaded and processed successfully!")

    # Button to view topic summaries with an emoji
    if st.button("View Topic Summaries", help="Click to view a brief summary of the uploaded papers", icon="📚"):
        for chunk in all_chunks[:3]:
            st.write(chunk)

    # User input for query without the icon
    user_question = st.text_input("Ask a question about the uploaded papers:", help="Ask about specific research details")

    if user_question:
        relevant_chunks = retrieve_relevant_chunks(user_question)
        if relevant_chunks:
            context = "\n\n".join([chunk['chunk'] for chunk, _ in relevant_chunks])
            answer = ask_groq_api(user_question, context)
            st.write("**Answer:**", answer)

            # st.subheader has no `icon` kwarg (the original call raised
            # TypeError); put the emoji in the heading text instead.
            st.subheader("⚠️ Research Gap Analysis:")
            research_gap = analyze_research_gaps(all_chunks)
            st.write(f"**Research Gaps Identified:** {research_gap}")
        else:
            st.write("No relevant sections found for your question.")

    # Research-gap keyword scan over all chunks.
    if st.button("Identify Research Gaps", help="Find unanswered questions or areas where research is lacking", icon="⚠️"):
        st.write("**Research Gap Analysis:**")
        research_gap_analysis = identify_research_gaps(all_chunks)
        st.write(research_gap_analysis)

    # Button to generate scatter plot with a chart emoji
    if st.button("Generate Scatter Plot", icon="📊"):
        st.write("Generating scatter plot for methods vs. results...")
        # Example scatter plot (replace with real data). Use an explicit
        # Figure instead of the deprecated global-pyplot st.pyplot(plt).
        fig, ax = plt.subplots()
        ax.scatter(np.random.rand(10), np.random.rand(10))
        ax.set_xlabel("Methods")
        ax.set_ylabel("Results")
        st.pyplot(fig)

    # Text area for annotations without the icon
    st.text_area("Annotate Your Insights:", height=100, key="annotations", help="Add your thoughts or comments here")