Update app.py

app.py CHANGED
@@ -1,22 +1,26 @@
 import os
 import streamlit as st
 import requests
-import feedparser
 import datetime
-from fuzzywuzzy import fuzz
 from dotenv import load_dotenv
-from …

 load_dotenv()
 OPENROUTER_API_KEY = os.getenv("OPENROUTER_API_KEY")

-# --- …
 def call_llm(messages, model="deepseek/deepseek-chat-v3-0324:free", max_tokens=2048, temperature=0.7):
     url = "https://openrouter.ai/api/v1/chat/completions"
     headers = {
         "Authorization": f"Bearer {OPENROUTER_API_KEY}",
         "Content-Type": "application/json",
-        "X-Title": "…
     }
     data = {
         "model": model,
@@ -24,183 +28,138 @@ def call_llm(messages, model="deepseek/deepseek-chat-v3-0324:free", max_tokens=2048, temperature=0.7):
         "max_tokens": max_tokens,
         "temperature": temperature
     }
-    try:
-        response = requests.post(url, headers=headers, json=data)
-        result = response.json()
-    except Exception as e:
-        raise RuntimeError(f"Failed to connect or parse response: {e}")
     if response.status_code != 200:
         raise RuntimeError(result.get("error", {}).get("message", "LLM API error"))
-    if "choices" not in result:
-        raise RuntimeError(f"Invalid response: {result}")
     return result["choices"][0]["message"]["content"]

-…
-        return plagiarized_snippets
-
-# --- Source Utilities ---
-def get_arxiv_papers(query, max_results=3):
     from urllib.parse import quote_plus
-    url = f"http://export.arxiv.org/api/query?search_query=all:{quote_plus(query)}&start=0&max_results={max_results}"
     feed = feedparser.parse(url)
     return [{
-        "title": e.title…
-        "summary": …
         "url": next((l.href for l in e.links if l.type == "application/pdf"), "")
     } for e in feed.entries]

-def get_semantic_scholar_papers…
     url = "https://api.semanticscholar.org/graph/v1/paper/search"
-    params = {"query": query, "limit": …
     response = requests.get(url, params=params)
     papers = response.json().get("data", [])
     return [{
-        "title": p.get("title")…
-        "summary": …
-        "url": p.get("url"…
     } for p in papers]

-def …
-…
-def generate_apa_citation…
-…
-    if source == "arxiv":
-        return f"{title}. ({current_year}). *arXiv*. {url}"
-    elif source == "semantic":
-        return f"{title}. ({current_year}). *Semantic Scholar*. {url}"
-    elif source == "web":
-        return f"{title}. ({current_year}). *Web Source*. {url}"
-    else:
-        return f"{title}. ({current_year}). {url}"
-
-# --- Research Agent ---
-def autonomous_research_agent(topic):
-    arxiv = get_arxiv_papers(topic)
-    scholar = get_semantic_scholar_papers(topic)
-    web = search_duckduckgo(topic)
-    images = get_image_urls(topic)
-
-    arxiv_md, arxiv_citations = "", []
-    for p in arxiv:
-        arxiv_md += f"- [{p['title']}]({p['url']})\n> {p['summary'][:300]}...\n\n"
-        arxiv_citations.append(generate_apa_citation(p["title"], p["url"], source="arxiv"))
-
-    scholar_md, scholar_citations = "", []
-    for p in scholar:
-        scholar_md += f"- [{p['title']}]({p['url']})\n> {p['summary'][:300]}...\n\n"
-        scholar_citations.append(generate_apa_citation(p["title"], p["url"], source="semantic"))
-
-    web_md, web_citations = "", []
-    for w in web:
-        web_md += f"- [{w['title']}]({w['url']})\n> {w['snippet']}\n\n"
-        web_citations.append(generate_apa_citation(w["title"], w["url"], source="web"))
-
-    prompt = f"""
-    # Research Topic: {topic}
-
-    ## ArXiv:
-    {arxiv_md}
-
-    ## Semantic Scholar:
-    {scholar_md}
-
-    ## Web Insights:
-    {web_md}
-
-    Now synthesize this information into:
-    1. A research gap
-    2. A novel research direction
-    3. A full markdown-formatted research article (continuous, no section labels, academic tone)
-    """
-    response = call_llm([{"role": "user", "content": prompt}], max_tokens=3000)
-
-    # Append Sources
-    response += "\n\n---\n### Sources Cited\n"
-    if arxiv_md:
-        response += "**ArXiv:**\n" + arxiv_md
-    if scholar_md:
-        response += "**Semantic Scholar:**\n" + scholar_md
-    if web_md:
-        response += "**Web:**\n" + web_md
-
-    # APA Citations Section
-    all_citations = arxiv_citations + scholar_citations + web_citations
-    response += "\n---\n### 📚 APA Citations\n"
-    for cite in all_citations:
-        response += f"- {cite}\n"
-
-    return response, images

 # --- Streamlit UI ---
-st.set_page_config("…
-st.title("🤖 …
-…
 else:
-    st.success("✅ No…
-
-# --- Follow-up Chat ---
-st.divider()
-st.subheader("💬 Follow-up Q&A")
-followup = st.text_input("Ask a follow-up question:")
-if st.button("Ask"):
-    if followup:
-        try:
-            chat = st.session_state.chat_history + [{"role": "user", "content": followup}]
-            answer = call_llm(chat, max_tokens=1500)
-            st.session_state.chat_history.append({"role": "user", "content": followup})
-            st.session_state.chat_history.append({"role": "assistant", "content": answer})
-            st.markdown(answer)
-        except Exception as e:
-            st.error(f"Follow-up error: {e}")
 import os
 import streamlit as st
 import requests
 import datetime
 from dotenv import load_dotenv
+from tavily import TavilyClient
+import feedparser
+import time
+from fuzzywuzzy import fuzz

+# Load environment variables
 load_dotenv()
 OPENROUTER_API_KEY = os.getenv("OPENROUTER_API_KEY")
+TAVILY_API_KEY = os.getenv("TAVILY_API_KEY", "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")
+tavily = TavilyClient(api_key=TAVILY_API_KEY)

+# --- Helper Functions ---
 def call_llm(messages, model="deepseek/deepseek-chat-v3-0324:free", max_tokens=2048, temperature=0.7):
     url = "https://openrouter.ai/api/v1/chat/completions"
     headers = {
         "Authorization": f"Bearer {OPENROUTER_API_KEY}",
         "Content-Type": "application/json",
+        "X-Title": "GPT Deep Research Agent"
     }
     data = {
         "model": model,
         "messages": messages,
         "max_tokens": max_tokens,
         "temperature": temperature
     }
+    response = requests.post(url, headers=headers, json=data)
+    result = response.json()
     if response.status_code != 200:
         raise RuntimeError(result.get("error", {}).get("message", "LLM API error"))
     return result["choices"][0]["message"]["content"]

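For orientation, a minimal smoke test of this helper, not part of app.py: it assumes OPENROUTER_API_KEY is set in the environment and that the free DeepSeek route is reachable; the question text is invented.

# Illustrative only: one-shot call through the helper above
messages = [{"role": "user", "content": "In one sentence, what is agentic research?"}]
print(call_llm(messages, max_tokens=100))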
+def get_sources(topic, domains):
+    query = topic
+    if domains and isinstance(domains, str):
+        domain_list = [d.strip() for d in domains.split(",") if d.strip()]
+        if domain_list:
+            query = " OR ".join([f"site:{d} {topic}" for d in domain_list])
+    results = tavily.search(query=query, search_depth="advanced", max_results=5)
+    return [{
+        "title": r.get("title", "Untitled"),
+        "snippet": r.get("content", ""),
+        "url": r.get("url", "")
+    } for r in results.get("results", [])]
+
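To make the domain scoping concrete, here is the query string the function above would build (topic and domains invented for the example):

# get_sources("AI agents", "techcrunch.com, forbes.com") builds the query:
#   "site:techcrunch.com AI agents OR site:forbes.com AI agents"
# and passes it to tavily.search(query=..., search_depth="advanced", max_results=5)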
+def get_arxiv_papers(query):
     from urllib.parse import quote_plus
+    url = f"http://export.arxiv.org/api/query?search_query=all:{quote_plus(query)}&start=0&max_results=3"
     feed = feedparser.parse(url)
     return [{
+        "title": e.title,
+        "summary": e.summary.replace("\n", " ").strip(),
         "url": next((l.href for l in e.links if l.type == "application/pdf"), "")
     } for e in feed.entries]

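For example, with the topic "graph neural networks" the helper would request the following URL (illustrative; quote_plus encodes the spaces):

# http://export.arxiv.org/api/query?search_query=all:graph+neural+networks&start=0&max_results=3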
+def get_semantic_papers(query):
     url = "https://api.semanticscholar.org/graph/v1/paper/search"
+    params = {"query": query, "limit": 3, "fields": "title,abstract,url"}
     response = requests.get(url, params=params)
     papers = response.json().get("data", [])
     return [{
+        "title": p.get("title"),
+        # "abstract" can be null in the API response; fall back to a placeholder
+        # so the [:300] slice applied downstream never hits None
+        "summary": p.get("abstract") or "No abstract available",
+        "url": p.get("url")
     } for p in papers]

+def check_plagiarism(text, topic):
+    hits = []
+    for r in get_sources(topic, ""):
+        similarity = fuzz.token_set_ratio(text, r["snippet"])
+        if similarity >= 75:
+            hits.append(r)
+    return hits
+
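A quick sketch of the fuzzywuzzy metric behind the 75-point threshold (strings invented for illustration):

# token_set_ratio ignores word order and repeated tokens, so a reshuffled
# phrase scores 100 even though the strings differ character-for-character:
fuzz.token_set_ratio("deep research agents", "agents for deep research")  # -> 100
fuzz.token_set_ratio("deep research agents", "quantum chemistry")         # -> low score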
+def generate_apa_citation(title, url, source):
+    year = datetime.datetime.now().year
+    label = {
+        "arxiv": "*arXiv*", "semantic": "*Semantic Scholar*", "web": "*Web Source*"
+    }.get(source, "*Web*")
+    return f"{title}. ({year}). {label}. {url}"

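So a call such as the following (title and URL invented) would render:

generate_apa_citation("Sample Paper Title", "https://example.com/paper", "web")
# -> 'Sample Paper Title. (<current year>). *Web Source*. https://example.com/paper'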
 # --- Streamlit UI ---
+st.set_page_config("Deep Research Bot", layout="wide")
+st.title("🤖 Real-time Deep Research Agent (Tavily Edition)")
+
+st.markdown("This powerful assistant autonomously gathers, analyzes, and synthesizes research from multiple sources in real time using Tavily.")
+
+topic = st.text_input("💡 What would you like me to research next?")
+report_type = st.selectbox("📄 Type of report", [
+    "Summary - Short and fast (~2 min)",
+    "Detailed Report (~5 min)",
+    "Thorough Academic Research (~10 min)"
+])
+tone = st.selectbox("🎯 Tone of the report", [
+    "Objective - Impartial and unbiased presentation of facts and findings",
+    "Persuasive - Advocating a specific point of view",
+    "Narrative - Storytelling tone for layperson readers"
+])
+source_type = st.selectbox("📚 Sources to include", [
+    "Web Only", "Academic Only", "Hybrid"
+])
+custom_domains = st.text_input("🌐 Query Domains (Optional)", placeholder="techcrunch.com, forbes.com")
+
+if st.button("Research"):
+    try:
+        with st.status("Starting agent tasks..."):
+            st.info("🧠 Thinking through research questions...")
+            time.sleep(1)
+            st.info("🔍 Fetching data from selected sources...")
+
+            all_data, citations = "", []
+            if source_type in ["Web Only", "Hybrid"]:
+                web = get_sources(topic, custom_domains)
+                for w in web:
+                    all_data += f"- [{w['title']}]({w['url']})\n> {w['snippet']}\n\n"
+                    citations.append(generate_apa_citation(w["title"], w["url"], "web"))
+            if source_type in ["Academic Only", "Hybrid"]:
+                arxiv = get_arxiv_papers(topic)
+                for p in arxiv:
+                    all_data += f"- [{p['title']}]({p['url']})\n> {p['summary'][:300]}...\n\n"
+                    citations.append(generate_apa_citation(p["title"], p["url"], "arxiv"))
+                scholar = get_semantic_papers(topic)
+                for s in scholar:
+                    all_data += f"- [{s['title']}]({s['url']})\n> {s['summary'][:300]}...\n\n"
+                    citations.append(generate_apa_citation(s["title"], s["url"], "semantic"))
+
+            st.success("Data collection complete!")
+
+        with st.spinner("📝 Writing final research report..."):
+            prompt = f"""
+            # Research Task: {topic}
+            Tone: {tone}
+            Report Type: {report_type}
+            Sources:
+            {all_data}
+            Now, synthesize:
+            1. Research questions and gap
+            2. A novel insight or direction
+            3. A real-world application scenario
+            4. A {report_type.lower()} in academic markdown (no headings)
+            """
+            output = call_llm([{"role": "user", "content": prompt}], max_tokens=3500)
+
+        st.subheader("📄 Research Report")
+        st.markdown(output, unsafe_allow_html=True)
+
+        st.markdown("### 📚 APA Citations")
+        for c in citations:
+            st.markdown(f"- {c}")
+
+        with st.spinner("🧪 Checking for overlaps..."):
+            overlaps = check_plagiarism(output, topic)
+            if overlaps:
+                st.warning("⚠️ Potential content overlap found.")
+                for h in overlaps:
+                    st.markdown(f"**{h['title']}** - [{h['url']}]({h['url']})")
             else:
+                st.success("✅ No major overlaps detected.")
+
+    except Exception as e:
+        st.error(f"Error: {e}")
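Taken together, the helpers can also be exercised headlessly; a minimal sketch under the assumption that app.py's functions are importable and both API keys are set (the topic is invented):

# Hypothetical headless run of the same pipeline (not part of app.py)
topic = "LLM agents for literature review"
web_hits = get_sources(topic, "")                              # Tavily web search
papers = get_arxiv_papers(topic) + get_semantic_papers(topic)  # academic sources
evidence = "".join(f"- {p['title']}\n" for p in web_hits + papers)
report = call_llm([{"role": "user", "content": f"Write a short report on {topic}.\nSources:\n{evidence}"}])
overlaps = check_plagiarism(report, topic)
print(report)
print(f"{len(overlaps)} potential overlap(s) flagged")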