Update app.py

app.py CHANGED
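
This update replaces the Semantic Scholar lookup with a LangChain-based RAG pipeline over uploaded documents (FAISS index + OpenAI embeddings + RetrievalQA), adds a file uploader and new source modes (Web / Documents / Hybrid), sets a page title and config, and fixes two small bugs: the OpenRouter Authorization header now interpolates the API key, and get_sources() gains the colon its if statement was missing.
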
@@ -3,22 +3,32 @@ import os
 import streamlit as st
 import requests
 import datetime
+import time
+import feedparser
 from dotenv import load_dotenv
 from duckduckgo_search import DDGS
-import feedparser
-import time
 from fuzzywuzzy import fuzz
+from langchain.embeddings.openai import OpenAIEmbeddings
+from langchain.vectorstores import FAISS
+from langchain.document_loaders import TextLoader, PyPDFLoader
+from langchain.text_splitter import RecursiveCharacterTextSplitter
+from langchain.chains import RetrievalQA
+from langchain.chat_models import ChatOpenAI

 load_dotenv()
 OPENROUTER_API_KEY = os.getenv("OPENROUTER_API_KEY")
+OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
+
+st.set_page_config("Advanced RAG Research Agent", layout="wide")
+st.title("🧠 Advanced Deep Research Agent (RAG + Documents + Real-time)")

-# ---
+# --- Core Utilities ---
 def call_llm(messages, model="deepseek/deepseek-chat-v3-0324:free", max_tokens=2048, temperature=0.7):
     url = "https://openrouter.ai/api/v1/chat/completions"
     headers = {
-        "Authorization": f"Bearer "
+        "Authorization": f"Bearer {OPENROUTER_API_KEY}",
         "Content-Type": "application/json",
-        "X-Title": "
+        "X-Title": "RAG Deep Research Agent"
     }
     data = {
         "model": model,
@@ -34,7 +44,7 @@ def call_llm(messages, model="deepseek/deepseek-chat-v3-0324:free", max_tokens=2

 def get_sources(topic, domains):
     query = topic
-    if domains
+    if domains:
         domain_list = [d.strip() for d in domains.split(",") if d.strip()]
         if domain_list:
             query = " OR ".join([f"site:{d} {topic}" for d in domain_list])
@@ -55,17 +65,6 @@ def get_arxiv_papers(query):
         "url": next((l.href for l in e.links if l.type == "application/pdf"), "")
     } for e in feed.entries]

-def get_semantic_papers(query):
-    url = "https://api.semanticscholar.org/graph/v1/paper/search"
-    params = {"query": query, "limit": 3, "fields": "title,abstract,url"}
-    response = requests.get(url, params=params)
-    papers = response.json().get("data", [])
-    return [{
-        "title": p.get("title"),
-        "summary": p.get("abstract", "No abstract available"),
-        "url": p.get("url")
-    } for p in papers]
-
 def check_plagiarism(text, topic):
     hits = []
     for r in get_sources(topic, ""):
@@ -77,90 +76,89 @@ def check_plagiarism(text, topic):
 def generate_apa_citation(title, url, source):
     year = datetime.datetime.now().year
     label = {
-        "arxiv": "*arXiv*", "semantic": "*Semantic Scholar*", "web": "*Web Source*"
+        "arxiv": "*arXiv*", "semantic": "*Semantic Scholar*", "web": "*Web Source*", "local": "*Uploaded Document*"
     }.get(source, "*Web*")
     return f"{title}. ({year}). {label}. {url}"

-# ---
-
-
-
-
-
-
+# --- RAG Processing from Uploaded Documents ---
+def process_uploaded_docs(uploaded_files):
+    docs = []
+    for file in uploaded_files:
+        ext = file.name.split(".")[-1].lower()
+        with open(f"/tmp/{file.name}", "wb") as f:
+            f.write(file.read())
+        loader = PyPDFLoader(f"/tmp/{file.name}") if ext == "pdf" else TextLoader(f"/tmp/{file.name}")
+        docs.extend(loader.load())
+    splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
+    split_docs = splitter.split_documents(docs)
+    vectorstore = FAISS.from_documents(split_docs, OpenAIEmbeddings(openai_api_key=OPENAI_API_KEY))
+    return RetrievalQA.from_chain_type(llm=ChatOpenAI(temperature=0, openai_api_key=OPENAI_API_KEY), retriever=vectorstore.as_retriever())
+
+# --- UI Layout ---
+topic = st.text_input("💡 Research Topic")
 report_type = st.selectbox("Type of report", [
-    "Summary - Short and fast
-    "Detailed Report (~5 min)",
-    "Thorough Academic Research (~10 min)"
+    "Summary - Short and fast", "Detailed Report", "Thorough Academic Research"
 ])
-tone = st.selectbox("🎯 Tone
-    "Objective
-    "Persuasive - Advocating a specific point of view",
-    "Narrative - Storytelling tone for layperson readers"
+tone = st.selectbox("🎯 Tone", [
+    "Objective", "Persuasive", "Narrative"
 ])
-source_type = st.selectbox("Sources
-
-
-custom_domains = st.text_input("Query Domains (Optional)", placeholder="techcrunch.com, forbes.com")
+source_type = st.selectbox("Sources", ["Web", "Documents", "Hybrid"])
+custom_domains = st.text_input("Query Domains (optional)", placeholder="example.com, site.org")
+uploaded_files = st.file_uploader("Upload PDFs or text documents", accept_multiple_files=True)

-if st.button("Research"):
+if st.button("Start Research"):
     try:
-
-
-
-
-
-
-
-
-
-
-
-            if source_type in ["Academic Only", "Hybrid"]:
-                arxiv = get_arxiv_papers(topic)
-                for p in arxiv:
-                    all_data += f"- [{p['title']}]({p['url']})\n> {p['summary'][:300]}...\n\n"
+        collected, citations = "", []
+        st.status("🧠 Agent initializing...")
+
+        if source_type in ["Web", "Hybrid"]:
+            with st.spinner("Collecting web and arXiv data..."):
+                sources = get_sources(topic, custom_domains)
+                for s in sources:
+                    collected += f"- [{s['title']}]({s['url']})\n> {s['snippet']}\n\n"
+                    citations.append(generate_apa_citation(s["title"], s["url"], "web"))
+                for p in get_arxiv_papers(topic):
+                    collected += f"- [{p['title']}]({p['url']})\n> {p['summary'][:300]}...\n\n"
                     citations.append(generate_apa_citation(p["title"], p["url"], "arxiv"))
-                scholar = get_semantic_papers(topic)
-                for s in scholar:
-                    all_data += f"- [{s['title']}]({s['url']})\n> {s['summary'][:300]}...\n\n"
-                    citations.append(generate_apa_citation(s["title"], s["url"], "semantic"))

-
+        if source_type in ["Documents", "Hybrid"] and uploaded_files:
+            with st.spinner("Processing documents with RAG..."):
+                rag_chain = process_uploaded_docs(uploaded_files)
+                rag_output = rag_chain.run(f"Summarize everything about: {topic}")
+                collected += f"\n**Document Insights:**\n\n{rag_output}\n"
+                citations.append(generate_apa_citation("Uploaded Materials", "local", "local"))

-        with st.spinner("Writing final
+        with st.spinner("Writing final report..."):
            prompt = f"""
-
-
+Topic: {topic}
 Tone: {tone}
 Report Type: {report_type}

 Sources:
-{
+{collected}

-Now
-1. Research
-2.
-3.
-4.
-
-
+Now generate:
+1. Research gap
+2. Novel direction
+3. Real-world example
+4. Full article in markdown format
+"""
+            response = call_llm([{"role": "user", "content": prompt}], max_tokens=3000)

         st.subheader("Research Report")
-        st.markdown(
-
+        st.markdown(response, unsafe_allow_html=True)
         st.markdown("### APA Citations")
         for c in citations:
             st.markdown(f"- {c}")

-        with st.spinner("
-            overlaps = check_plagiarism(
+        with st.spinner("Checking for plagiarism..."):
+            overlaps = check_plagiarism(response, topic)
         if overlaps:
-            st.warning("⚠️
+            st.warning("⚠️ Content overlap found")
             for h in overlaps:
-                st.markdown(f"**{h['title']}**
+                st.markdown(f"**{h['title']}** [{h['url']}]({h['url']})")
         else:
-            st.success("✅ No
+            st.success("✅ No overlap detected")

     except Exception as e:
-        st.error(f"Error: {e}")
+        st.error(f"🚨 Error: {e}")
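
The first hunk fixes call_llm()'s headers: the Authorization value now actually interpolates OPENROUTER_API_KEY, and X-Title is set to "RAG Deep Research Agent". The request body beyond "model" sits outside the hunk, but the same endpoint and headers can be smoke-tested in isolation. A minimal sketch, not part of app.py; it assumes OPENROUTER_API_KEY is exported and uses the app's default free DeepSeek model:

# openrouter_smoke_test.py (hypothetical helper, not in this commit)
import os
import requests

resp = requests.post(
    "https://openrouter.ai/api/v1/chat/completions",
    headers={
        "Authorization": f"Bearer {os.getenv('OPENROUTER_API_KEY')}",  # same header the fixed call_llm() sends
        "Content-Type": "application/json",
        "X-Title": "RAG Deep Research Agent",
    },
    json={
        "model": "deepseek/deepseek-chat-v3-0324:free",
        "messages": [{"role": "user", "content": "Reply with one short sentence."}],
        "max_tokens": 64,
        "temperature": 0.7,
    },
    timeout=60,
)
resp.raise_for_status()
print(resp.json()["choices"][0]["message"]["content"])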
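
The second hunk is a one-character fix: the old "if domains" was missing its colon, which is a SyntaxError. With the colon in place, get_sources() turns a comma-separated domain list into a site-restricted search query. The snippet below reuses that exact logic with placeholder inputs to show the resulting query string:

# query_builder_demo.py (illustrative; topic and domains are placeholders)
topic = "retrieval augmented generation"
domains = "example.com, site.org"  # same format as the "Query Domains (optional)" field

query = topic
if domains:
    domain_list = [d.strip() for d in domains.split(",") if d.strip()]
    if domain_list:
        query = " OR ".join([f"site:{d} {topic}" for d in domain_list])

print(query)
# site:example.com retrieval augmented generation OR site:site.org retrieval augmented generation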
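
The centerpiece of the last hunk is process_uploaded_docs(): each upload is written to /tmp, loaded with PyPDFLoader or TextLoader, split into 1000-character chunks with 100-character overlap, embedded into a FAISS index, and wrapped in a RetrievalQA chain that the app then queries with rag_chain.run(...). The same pipeline can be exercised outside Streamlit. A rough sketch, assuming a local ./sample.pdf and the legacy langchain.* import paths used above (newer LangChain releases relocate these modules):

# rag_pipeline_sketch.py (illustrative; mirrors process_uploaded_docs from the diff)
import os
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import FAISS
from langchain.chains import RetrievalQA
from langchain.chat_models import ChatOpenAI

docs = PyPDFLoader("./sample.pdf").load()              # one Document per PDF page
splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
chunks = splitter.split_documents(docs)                # same chunking parameters as the app
vectorstore = FAISS.from_documents(chunks, OpenAIEmbeddings(openai_api_key=os.getenv("OPENAI_API_KEY")))
qa = RetrievalQA.from_chain_type(
    llm=ChatOpenAI(temperature=0, openai_api_key=os.getenv("OPENAI_API_KEY")),
    retriever=vectorstore.as_retriever(),
)
print(qa.run("Summarize everything about: the uploaded document"))

Running this (and the Space itself) also needs faiss-cpu, pypdf and openai installed alongside langchain; the diff does not show a requirements file, so the exact pinned versions are unknown.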