import os
import streamlit as st
from PyPDF2 import PdfReader
from sentence_transformers import SentenceTransformer
import faiss
import matplotlib.pyplot as plt
import numpy as np
from groq import Groq

GROQ_API_KEY = "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"
client = Groq(api_key=GROQ_API_KEY)

# Initialize Embedding Model
embedding_model = SentenceTransformer('all-MiniLM-L6-v2')

# Initialize FAISS Index
embedding_dim = 384  # Dimensionality of 'all-MiniLM-L6-v2'
faiss_index = faiss.IndexFlatL2(embedding_dim)
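# Note: IndexFlatL2 performs exact (brute-force) L2 search and expects float32
# vectors, which is what SentenceTransformer.encode returns by default.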

# Store Metadata
metadata_store = []

# Function to extract text from PDFs
def extract_text_from_pdf(pdf_file):
    pdf_reader = PdfReader(pdf_file)
    text = ""
    for page in pdf_reader.pages:
        # extract_text() may return None for pages without extractable text
        text += page.extract_text() or ""
    return text

# Function to chunk text
def chunk_text(text, chunk_size=500):
    words = text.split()
    return [' '.join(words[i:i+chunk_size]) for i in range(0, len(words), chunk_size)]

# Function to generate embeddings
def generate_embeddings(chunks):
    return embedding_model.encode(chunks)

# Store embeddings in FAISS index
def store_embeddings(embeddings, metadata):
    faiss_index.add(np.array(embeddings))
    metadata_store.extend(metadata)

# Retrieve relevant chunks based on query
def retrieve_relevant_chunks(query, k=5):
    query_embedding = embedding_model.encode([query])
    distances, indices = faiss_index.search(query_embedding, k)
    # FAISS pads results with index -1 when fewer than k vectors are stored, so keep only in-range indices
    valid_results = [
        (metadata_store[i], distances[0][j]) for j, i in enumerate(indices[0]) if 0 <= i < len(metadata_store)
    ]
    return valid_results

# Call Groq API to get answers and research gap analysis
def ask_groq_api(question, context):
    chat_completion = client.chat.completions.create(
        messages=[{"role": "user", "content": f"{context}\n\n{question}"}],
        model="llama3-8b-8192"
    )
    return chat_completion.choices[0].message.content

# Function to analyze and identify research gaps by comparing chunks from different papers
def analyze_research_gaps(chunks):
    # Compare text from different chunks to flag potential discrepancies
    gaps = []
    for i, chunk_1 in enumerate(chunks):
        for j, chunk_2 in enumerate(chunks):
            if i != j:
                # Simple heuristic: compare the first 100 characters of each chunk
                if chunk_1[:100] != chunk_2[:100]:
                    gaps.append(f"Potential inconsistency between chunk {i} and chunk {j}.")
    return "\n".join(gaps) if gaps else "No major inconsistencies found."

# Function to identify unanswered questions based on comparative analysis of multiple papers
def identify_research_gaps(chunks):
    unanswered_questions = []
    # Simple keyword search for phrases that typically signal open questions
    for chunk in chunks:
        if "future research" in chunk or "unanswered questions" in chunk:
            unanswered_questions.append(chunk)
    return "\n".join(unanswered_questions) if unanswered_questions else "No specific unanswered questions found."

# Streamlit UI setup
st.title("RAG-Based Research Paper Analyzer")

uploaded_files = st.file_uploader("Upload PDF Files", accept_multiple_files=True, type="pdf")

if uploaded_files:
    all_chunks = []
    all_metadata = []
    for uploaded_file in uploaded_files:
        text = extract_text_from_pdf(uploaded_file)
        chunks = chunk_text(text)
        embeddings = generate_embeddings(chunks)
        metadata = [{"chunk": chunk, "file_name": uploaded_file.name} for chunk in chunks]
        store_embeddings(embeddings, metadata)
        all_chunks.extend(chunks)
        all_metadata.extend(metadata)

    st.success("Files uploaded and processed successfully!")

    # Button to view topic summaries with an emoji
    if st.button("View Topic Summaries", help="Click to view a brief summary of the uploaded papers", icon="πŸ“š"):
        for chunk in all_chunks[:3]:
            st.write(chunk)

    # User input for query without the icon
    user_question = st.text_input("Ask a question about the uploaded papers:", help="Ask about specific research details")

    if user_question:
        relevant_chunks = retrieve_relevant_chunks(user_question)
        if relevant_chunks:
            context = "\n\n".join([chunk['chunk'] for chunk, _ in relevant_chunks])
            answer = ask_groq_api(user_question, context)
            st.write("**Answer:**", answer)

            # Implement Research Gap Identification based on inconsistencies between papers
            st.subheader("Research Gap Analysis:", icon="⚠️")
            # We will analyze the chunks and context to identify research gaps
            research_gap = analyze_research_gaps(all_chunks)
            st.write(f"**Research Gaps Identified:** {research_gap}")
        else:
            st.write("No relevant sections found for your question.")

    # Button to run a standalone research gap analysis across the uploaded chunks
    if st.button("Identify Research Gaps", help="Find unanswered questions or areas where research is lacking", icon="⚠️"):
        st.write("**Research Gap Analysis:**")
        # Implementing research gap analysis based on comparing papers
        research_gap_analysis = identify_research_gaps(all_chunks)
        st.write(research_gap_analysis)

    # Button to generate scatter plot with a chart emoji
    if st.button("Generate Scatter Plot", icon="πŸ“Š"):
        st.write("Generating scatter plot for methods vs. results...")
        # Example scatter plot (replace with real data)
        x = np.random.rand(10)
        y = np.random.rand(10)
        plt.scatter(x, y)
        plt.xlabel("Methods")
        plt.ylabel("Results")
        st.pyplot(plt)

    # Text area for annotations without the icon
    st.text_area("Annotate Your Insights:", height=100, key="annotations", help="Add your thoughts or comments here")
