import os
import streamlit as st
from PyPDF2 import PdfReader
from sentence_transformers import SentenceTransformer
import faiss
import matplotlib.pyplot as plt
import numpy as np
from groq import Groq

# Read the Groq API key from the environment rather than hardcoding a secret
GROQ_API_KEY = os.environ.get("GROQ_API_KEY")
client = Groq(api_key=GROQ_API_KEY)

# Initialize Embedding Model
embedding_model = SentenceTransformer('all-MiniLM-L6-v2')

# Initialize FAISS Index
embedding_dim = 384  # Dimensionality of 'all-MiniLM-L6-v2'
faiss_index = faiss.IndexFlatL2(embedding_dim)
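# IndexFlatL2 does exact, brute-force nearest-neighbour search on L2 distance,
# which is fine for a handful of papers; for larger corpora an approximate
# index (e.g. faiss.IndexIVFFlat) would likely scale better.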

# Store Metadata
metadata_store = []

# Function to flag chunks that mention research gaps, using simple keyword matching
def identify_research_gaps(chunks):
    unanswered_questions = []
    # List of keywords related to research gaps
    keywords = [
        "future research", "unanswered questions", "limitations", "need for more research", 
        "open questions", "further investigation", "research gap", "areas for improvement",
        "unknowns", "unexplored", "unresolved issues"
    ]
    
    # Search for chunks containing any of the keywords
    for chunk in chunks:
        if any(keyword in chunk.lower() for keyword in keywords):
            unanswered_questions.append(chunk)
    
    return "\n".join(unanswered_questions) if unanswered_questions else "No specific unanswered questions found."

# Function to flag potential inconsistencies by comparing chunks pairwise;
# defined before the Streamlit UI below so it exists when the UI calls it
def analyze_research_gaps(chunks):
    gaps = []
    # Crude placeholder heuristic: compare each unordered pair of chunks once
    # and treat differing openings as a potential inconsistency
    for i in range(len(chunks)):
        for j in range(i + 1, len(chunks)):
            if chunks[i][:100] != chunks[j][:100]:  # first 100 characters only
                gaps.append(f"Potential inconsistency between chunk {i} and chunk {j}.")
    return "\n".join(gaps) if gaps else "No major inconsistencies found."

# Function to extract text from PDFs
def extract_text_from_pdf(pdf_file):
    pdf_reader = PdfReader(pdf_file)
    text = ""
    for page in pdf_reader.pages:
        text += page.extract_text() or ""  # extract_text() may return None on image-only pages
    return text

# Function to chunk text
def chunk_text(text, chunk_size=500):
    words = text.split()
    return [' '.join(words[i:i+chunk_size]) for i in range(0, len(words), chunk_size)]
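# For example, chunk_text("one two three four five", chunk_size=2)
# returns ["one two", "three four", "five"].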

# Function to generate embeddings
def generate_embeddings(chunks):
    return embedding_model.encode(chunks)
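# encode() on a list returns a float32 numpy array of shape (len(chunks), 384),
# which is the layout faiss_index.add and .search expect.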

# Store embeddings in FAISS index
def store_embeddings(embeddings, metadata):
    faiss_index.add(np.array(embeddings))
    metadata_store.extend(metadata)
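# IndexFlatL2 assigns sequential ids (0, 1, 2, ...) in insertion order, so
# position i in metadata_store lines up with vector id i in the index.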

# Retrieve relevant chunks based on query
def retrieve_relevant_chunks(query, k=5):
    query_embedding = embedding_model.encode([query])
    distances, indices = faiss_index.search(query_embedding, k)
    # FAISS pads the result with index -1 when fewer than k vectors are stored,
    # so filter out-of-range ids before looking up metadata
    valid_results = [
        (metadata_store[i], distances[0][j]) for j, i in enumerate(indices[0]) if 0 <= i < len(metadata_store)
    ]
    return valid_results
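# Example: retrieve_relevant_chunks("Which dataset was used?", k=3) yields up
# to 3 (metadata, distance) pairs; a smaller L2 distance means a closer match.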

# Call Groq API to get answers and research gap analysis
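# NOTE: "llama3-8b-8192" is assumed to still be a valid Groq model id; Groq
# retires model names over time, so swap in a current one if this call fails.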
def ask_groq_api(question, context):
    chat_completion = client.chat.completions.create(
        messages=[{"role": "user", "content": f"{context}\n\n{question}"}],
        model="llama3-8b-8192"
    )
    return chat_completion.choices[0].message.content

# Streamlit UI setup
st.title("RAG-Based Research Paper Analyzer")

uploaded_files = st.file_uploader("Upload PDF Files", accept_multiple_files=True, type="pdf")
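# st.file_uploader returns file-like UploadedFile objects, which PdfReader
# accepts directly.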

if uploaded_files:
    all_chunks = []
    all_metadata = []
    for uploaded_file in uploaded_files:
        text = extract_text_from_pdf(uploaded_file)
        chunks = chunk_text(text)
        embeddings = generate_embeddings(chunks)
        metadata = [{"chunk": chunk, "file_name": uploaded_file.name} for chunk in chunks]
        store_embeddings(embeddings, metadata)
        all_chunks.extend(chunks)
        all_metadata.extend(metadata)

    st.success("Files uploaded and processed successfully!")
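    # NOTE: Streamlit re-runs this whole script on every widget interaction,
    # so the PDFs above are re-parsed and re-embedded each time; caching
    # (e.g. st.session_state or st.cache_resource) would avoid the rework.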

    # Button to preview the uploaded papers (currently shows the first few raw
    # chunks; a real summary would send each paper through the LLM)
    if st.button("πŸ“š View Topic Summaries", help="Click to view a brief summary of the uploaded papers"):
        for chunk in all_chunks[:3]:
            st.write(chunk)

    # Text input for a free-form question about the papers
    user_question = st.text_input("Ask a question about the uploaded papers:", help="Ask about specific research details")

    if user_question:
        relevant_chunks = retrieve_relevant_chunks(user_question)
        if relevant_chunks:
            context = "\n\n".join(meta["chunk"] for meta, _ in relevant_chunks)
            answer = ask_groq_api(user_question, context)
            st.write("**Answer:**", answer)

            # Research gap identification based on inconsistencies between chunks
            st.subheader("⚠️ Research Gap Analysis:")
            research_gap = analyze_research_gaps(all_chunks)
            st.write(f"**Research Gaps Identified:** {research_gap}")
        else:
            st.write("No relevant sections found for your question.")

    # Button to run keyword-based research gap analysis across all chunks
    if st.button("⚠️ Identify Research Gaps", help="Find unanswered questions or areas where research is lacking"):
        st.write("**Research Gap Analysis:**")
        # Implementing research gap analysis based on comparing papers
        research_gap_analysis = identify_research_gaps(all_chunks)
        st.write(research_gap_analysis)

    # Button to generate a demo scatter plot
    if st.button("πŸ“Š Generate Scatter Plot"):
        st.write("Generating scatter plot for methods vs. results...")
        # Example scatter plot with random placeholder data; replace with real
        # per-paper metrics once they are extracted
        fig, ax = plt.subplots()
        x = np.random.rand(10)
        y = np.random.rand(10)
        ax.scatter(x, y)
        ax.set_xlabel("Methods")
        ax.set_ylabel("Results")
        st.pyplot(fig)  # pass an explicit figure; implicit global pyplot state is deprecated in Streamlit

    # Text area for user annotations
    st.text_area("Annotate Your Insights:", height=100, key="annotations", help="Add your thoughts or comments here")
