Update app.py
app.py
CHANGED
@@ -3,32 +3,22 @@ import os
 import streamlit as st
 import requests
 import datetime
-import time
-import feedparser
 from dotenv import load_dotenv
 from duckduckgo_search import DDGS
+import feedparser
+import time
 from fuzzywuzzy import fuzz
-from langchain.embeddings.openai import OpenAIEmbeddings
-from langchain.vectorstores import FAISS
-from langchain.document_loaders import TextLoader, PyPDFLoader
-from langchain.text_splitter import RecursiveCharacterTextSplitter
-from langchain.chains import RetrievalQA
-from langchain.chat_models import ChatOpenAI
 
 load_dotenv()
 OPENROUTER_API_KEY = os.getenv("OPENROUTER_API_KEY")
-OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
-
-st.set_page_config("Advanced RAG Research Agent", layout="wide")
-st.title("🧠 Advanced Deep Research Agent (RAG + Documents + Real-time)")
 
-# ---
+# --- Helper Functions ---
 def call_llm(messages, model="deepseek/deepseek-chat-v3-0324:free", max_tokens=2048, temperature=0.7):
     url = "https://openrouter.ai/api/v1/chat/completions"
     headers = {
-        "Authorization": f"Bearer
+        "Authorization": f"Bearer " + OPENROUTER_API_KEY,
         "Content-Type": "application/json",
-        "X-Title": "
+        "X-Title": "GPT Deep Research Agent"
     }
     data = {
         "model": model,
@@ -43,17 +33,12 @@ def call_llm(messages, model="deepseek/deepseek-chat-v3-0324:free", max_tokens=2
     return result["choices"][0]["message"]["content"]
 
 def get_sources(topic, domains):
-    query = topic
-    if domains:
-        domain_list = [d.strip() for d in domains.split(",") if d.strip()]
-        if domain_list:
-            query = " OR ".join([f"site:{d} {topic}" for d in domain_list])
     with DDGS() as ddgs:
         return [{
             "title": r.get("title", "Untitled"),
             "snippet": r.get("body", ""),
             "url": r.get("href", "")
-        } for r in ddgs.text(
+        } for r in ddgs.text(topic + " site:" + domains if domains else topic, max_results=5)]
 
 def get_arxiv_papers(query):
     from urllib.parse import quote_plus
@@ -65,6 +50,17 @@ def get_arxiv_papers(query):
         "url": next((l.href for l in e.links if l.type == "application/pdf"), "")
     } for e in feed.entries]
 
+def get_semantic_papers(query):
+    url = "https://api.semanticscholar.org/graph/v1/paper/search"
+    params = {"query": query, "limit": 3, "fields": "title,abstract,url"}
+    response = requests.get(url, params=params)
+    papers = response.json().get("data", [])
+    return [{
+        "title": p.get("title"),
+        "summary": p.get("abstract", "No abstract available"),
+        "url": p.get("url")
+    } for p in papers]
+
 def check_plagiarism(text, topic):
     hits = []
     for r in get_sources(topic, ""):
@@ -76,89 +72,90 @@ def check_plagiarism(text, topic):
 def generate_apa_citation(title, url, source):
     year = datetime.datetime.now().year
     label = {
-        "arxiv": "*arXiv*", "semantic": "*Semantic Scholar*", "web": "*Web Source*"
+        "arxiv": "*arXiv*", "semantic": "*Semantic Scholar*", "web": "*Web Source*"
     }.get(source, "*Web*")
     return f"{title}. ({year}). {label}. {url}"
 
-# ---
-
-
-
-
-
-
-        loader = PyPDFLoader(f"/tmp/{file.name}") if ext == "pdf" else TextLoader(f"/tmp/{file.name}")
-        docs.extend(loader.load())
-    splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
-    split_docs = splitter.split_documents(docs)
-    vectorstore = FAISS.from_documents(split_docs, OpenAIEmbeddings(openai_api_key=OPENAI_API_KEY))
-    return RetrievalQA.from_chain_type(llm=ChatOpenAI(temperature=0, openai_api_key=OPENAI_API_KEY), retriever=vectorstore.as_retriever())
-
-# --- UI Layout ---
-topic = st.text_input("💡 Research Topic")
+# --- Streamlit UI ---
+st.set_page_config("Deep Research Bot", layout="wide")
+st.title("🤖 Real-time Deep Research Agent")
+
+st.markdown("This powerful assistant autonomously gathers, analyzes, and synthesizes research from multiple sources in real-time.")
+
+topic = st.text_input("💡 What would you like me to research next?")
 report_type = st.selectbox("π Type of report", [
-    "Summary - Short and fast
+    "Summary - Short and fast (~2 min)",
+    "Detailed Report (~5 min)",
+    "Thorough Academic Research (~10 min)"
 ])
-tone = st.selectbox("🎯 Tone", [
-    "Objective
+tone = st.selectbox("🎯 Tone of the report", [
+    "Objective - Impartial and unbiased presentation of facts and findings",
+    "Persuasive - Advocating a specific point of view",
+    "Narrative - Storytelling tone for layperson readers"
 ])
-source_type = st.selectbox("π Sources
-
-
+source_type = st.selectbox("π Sources to include", [
+    "Web Only", "Academic Only", "Hybrid"
+])
+custom_domains = st.text_input("π Query Domains (Optional)", placeholder="techcrunch.com, forbes.com")
 
-if st.button("
+if st.button("Research"):
     try:
-
-
-
-
-
-
-
-
-
-
-
+        with st.status("Starting agent tasks..."):
+            st.info("🧠 Thinking through research questions...")
+            time.sleep(1)
+            st.info("π Fetching data from selected sources...")
+
+        all_data, citations = "", []
+        if source_type in ["Web Only", "Hybrid"]:
+            web = get_sources(topic, custom_domains)
+            for w in web:
+                all_data += f"- [{w['title']}]({w['url']})\n> {w['snippet']}\n\n"
+                citations.append(generate_apa_citation(w["title"], w["url"], "web"))
+        if source_type in ["Academic Only", "Hybrid"]:
+            arxiv = get_arxiv_papers(topic)
+            for p in arxiv:
+                all_data += f"- [{p['title']}]({p['url']})\n> {p['summary'][:300]}...\n\n"
                 citations.append(generate_apa_citation(p["title"], p["url"], "arxiv"))
+            scholar = get_semantic_papers(topic)
+            for s in scholar:
+                all_data += f"- [{s['title']}]({s['url']})\n> {s['summary'][:300]}...\n\n"
+                citations.append(generate_apa_citation(s["title"], s["url"], "semantic"))
 
-
-        with st.spinner("π Processing documents with RAG..."):
-            rag_chain = process_uploaded_docs(uploaded_files)
-            rag_output = rag_chain.run(f"Summarize everything about: {topic}")
-            collected += f"\n**Document Insights:**\n\n{rag_output}\n"
-            citations.append(generate_apa_citation("Uploaded Materials", "local", "local"))
+        st.success("Data collection complete!")
 
-        with st.spinner("π Writing final report..."):
+        with st.spinner("π Writing final research report..."):
             prompt = f"""
-
+            # Research Task: {topic}
+
             Tone: {tone}
             Report Type: {report_type}
 
             Sources:
-            {
+            {all_data}
 
-            Now
-            1. Research gap
-            2.
-            3.
-            4.
-            """
-
+            Now, synthesize:
+            1. Research questions and gap
+            2. A novel insight or direction
+            3. A real-world application scenario
+            4. A {report_type.lower()} in academic markdown (no headings)
+            """
+            output = call_llm([{"role": "user", "content": prompt}], max_tokens=3500)
 
         st.subheader("π Research Report")
-        st.markdown(
+        st.markdown(output, unsafe_allow_html=True)
+
         st.markdown("### π APA Citations")
         for c in citations:
             st.markdown(f"- {c}")
 
-        with st.spinner("
-            overlaps = check_plagiarism(
+        with st.spinner("🧪 Checking for overlaps..."):
+            overlaps = check_plagiarism(output, topic)
         if overlaps:
-            st.warning("⚠️
+            st.warning("⚠️ Potential content overlap found.")
             for h in overlaps:
-                st.markdown(f"**{h['title']}**
+                st.markdown(f"**{h['title']}** - [{h['url']}]({h['url']})")
         else:
-            st.success("✅ No
+            st.success("✅ No major overlaps detected.")
 
     except Exception as e:
-        st.error(f"
+        st.error(f"Error: {e}")
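
For reference, the `get_semantic_papers` helper added in this commit can be exercised on its own, outside the Streamlit app. A minimal standalone sketch: the function body mirrors the diff above, while the `__main__` query and the printed fields are illustrative assumptions, not part of the commit.

import requests

def get_semantic_papers(query):
    # Same endpoint and fields as in app.py: Semantic Scholar Graph API paper search.
    url = "https://api.semanticscholar.org/graph/v1/paper/search"
    params = {"query": query, "limit": 3, "fields": "title,abstract,url"}
    response = requests.get(url, params=params)
    papers = response.json().get("data", [])
    return [{
        "title": p.get("title"),
        "summary": p.get("abstract", "No abstract available"),
        "url": p.get("url"),
    } for p in papers]

if __name__ == "__main__":
    # Illustrative query only.
    for paper in get_semantic_papers("retrieval augmented generation"):
        print(paper["title"], "-", paper["url"])

The title/summary/url dictionaries returned here are the same shape app.py appends to all_data and passes to generate_apa_citation.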