Ani14 commited on
Commit
7d54951
·
verified ·
1 Parent(s): 4083831

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +132 -144
app.py CHANGED
@@ -3,44 +3,38 @@ import os
3
  import streamlit as st
4
  import requests
5
  import datetime
6
- from dotenv import load_dotenv
7
- from duckduckgo_search import DDGS
8
  import feedparser
9
- import time
10
- from fuzzywuzzy import fuzz
 
 
 
11
 
 
12
  load_dotenv()
13
- OPENROUTER_API_KEY = os.getenv("OPENROUTER_API_KEY")
14
-
15
- # --- Helper Functions ---
16
- def call_llm(messages, model="deepseek/deepseek-chat-v3-0324:free", max_tokens=2048, temperature=0.7):
17
- url = "https://openrouter.ai/api/v1/chat/completions"
18
- headers = {
19
- "Authorization": f"Bearer " + OPENROUTER_API_KEY,
20
- "Content-Type": "application/json",
21
- "X-Title": "GPT Deep Research Agent"
22
- }
23
- data = {
24
- "model": model,
25
- "messages": messages,
26
- "max_tokens": max_tokens,
27
- "temperature": temperature
28
- }
29
- response = requests.post(url, headers=headers, json=data)
30
- result = response.json()
31
- if response.status_code != 200:
32
- raise RuntimeError(result.get("error", {}).get("message", "LLM API error"))
33
- return result["choices"][0]["message"]["content"]
34
-
35
- def get_sources(topic, domains):
36
- with DDGS() as ddgs:
37
- return [{
38
- "title": r.get("title", "Untitled"),
39
- "snippet": r.get("body", ""),
40
- "url": r.get("href", "")
41
- } for r in ddgs.text(topic + " site:" + domains if domains else topic, max_results=5)]
42
-
43
- def get_arxiv_papers(query):
44
  from urllib.parse import quote_plus
45
  url = f"http://export.arxiv.org/api/query?search_query=all:{quote_plus(query)}&start=0&max_results=3"
46
  feed = feedparser.parse(url)
@@ -50,112 +44,106 @@ def get_arxiv_papers(query):
50
  "url": next((l.href for l in e.links if l.type == "application/pdf"), "")
51
  } for e in feed.entries]
52
 
53
- def get_semantic_papers(query):
54
- url = "https://api.semanticscholar.org/graph/v1/paper/search"
55
- params = {"query": query, "limit": 3, "fields": "title,abstract,url"}
56
- response = requests.get(url, params=params)
57
- papers = response.json().get("data", [])
58
- return [{
59
- "title": p.get("title"),
60
- "summary": p.get("abstract", "No abstract available"),
61
- "url": p.get("url")
62
- } for p in papers]
63
-
64
- def check_plagiarism(text, topic):
65
- hits = []
66
- for r in get_sources(topic, ""):
67
- similarity = fuzz.token_set_ratio(text, r["snippet"])
68
- if similarity >= 75:
69
- hits.append(r)
70
- return hits
71
-
72
- def generate_apa_citation(title, url, source):
73
- year = datetime.datetime.now().year
74
- label = {
75
- "arxiv": "*arXiv*", "semantic": "*Semantic Scholar*", "web": "*Web Source*"
76
- }.get(source, "*Web*")
77
- return f"{title}. ({year}). {label}. {url}"
78
-
79
- # --- Streamlit UI ---
80
- st.set_page_config("Deep Research Bot", layout="wide")
81
- st.title("πŸ€– Real-time Deep Research Agent")
82
-
83
- st.markdown("This powerful assistant autonomously gathers, analyzes, and synthesizes research from multiple sources in real-time.")
84
-
85
- topic = st.text_input("πŸ’‘ What would you like me to research next?")
86
- report_type = st.selectbox("πŸ“„ Type of report", [
87
- "Summary - Short and fast (~2 min)",
88
- "Detailed Report (~5 min)",
89
- "Thorough Academic Research (~10 min)"
90
- ])
91
- tone = st.selectbox("🎯 Tone of the report", [
92
- "Objective - Impartial and unbiased presentation of facts and findings",
93
- "Persuasive - Advocating a specific point of view",
94
- "Narrative - Storytelling tone for layperson readers"
95
- ])
96
- source_type = st.selectbox("🌐 Sources to include", [
97
- "Web Only", "Academic Only", "Hybrid"
98
- ])
99
- custom_domains = st.text_input("πŸ” Query Domains (Optional)", placeholder="techcrunch.com, forbes.com")
100
-
101
- if st.button("Research"):
102
- try:
103
- with st.status("Starting agent tasks..."):
104
- st.info("🧠 Thinking through research questions...")
105
- time.sleep(1)
106
- st.info("🌐 Fetching data from selected sources...")
107
-
108
- all_data, citations = "", []
109
- if source_type in ["Web Only", "Hybrid"]:
110
- web = get_sources(topic, custom_domains)
111
- for w in web:
112
- all_data += f"- [{w['title']}]({w['url']})\n> {w['snippet']}\n\n"
113
- citations.append(generate_apa_citation(w["title"], w["url"], "web"))
114
- if source_type in ["Academic Only", "Hybrid"]:
115
- arxiv = get_arxiv_papers(topic)
116
- for p in arxiv:
117
- all_data += f"- [{p['title']}]({p['url']})\n> {p['summary'][:300]}...\n\n"
118
- citations.append(generate_apa_citation(p["title"], p["url"], "arxiv"))
119
- scholar = get_semantic_papers(topic)
120
- for s in scholar:
121
- all_data += f"- [{s['title']}]({s['url']})\n> {s['summary'][:300]}...\n\n"
122
- citations.append(generate_apa_citation(s["title"], s["url"], "semantic"))
123
-
124
- st.success("Data collection complete!")
125
-
126
- with st.spinner("πŸ“ Writing final research report..."):
127
- prompt = f"""
128
- # Research Task: {topic}
129
-
130
- Tone: {tone}
131
- Report Type: {report_type}
132
-
133
- Sources:
134
- {all_data}
135
-
136
- Now, synthesize:
137
- 1. Research questions and gap
138
- 2. A novel insight or direction
139
- 3. A real-world application scenario
140
- 4. A {report_type.lower()} in academic markdown (no headings)
141
- """
142
- output = call_llm([{"role": "user", "content": prompt}], max_tokens=3500)
143
-
144
- st.subheader("πŸ“„ Research Report")
145
- st.markdown(output, unsafe_allow_html=True)
146
-
147
- st.markdown("### πŸ“š APA Citations")
148
- for c in citations:
149
- st.markdown(f"- {c}")
150
-
151
- with st.spinner("πŸ§ͺ Checking for overlaps..."):
152
- overlaps = check_plagiarism(output, topic)
153
- if overlaps:
154
- st.warning("⚠️ Potential content overlap found.")
155
- for h in overlaps:
156
- st.markdown(f"**{h['title']}** - [{h['url']}]({h['url']})")
157
- else:
158
- st.success("βœ… No major overlaps detected.")
159
-
160
- except Exception as e:
161
- st.error(f"Error: {e}")
 
3
  import streamlit as st
4
  import requests
5
  import datetime
6
+ import openai
 
7
  import feedparser
8
+ from dotenv import load_dotenv
9
+ from tavily import TavilyClient
10
+ from PyPDF2 import PdfReader
11
+ import faiss
12
+ import numpy as np
13
 
14
+ # --- Load API Keys ---
15
  load_dotenv()
16
+ openai.api_key = os.getenv("OPENAI_API_KEY")
17
+ TAVILY_API_KEY = os.getenv("TAVILY_API_KEY", "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")
18
+ tavily = TavilyClient(api_key=TAVILY_API_KEY)
19
+
20
+ # --- Streamlit Config ---
21
+ st.set_page_config(page_title="GPT Researcher Agent", layout="wide")
22
+ st.title("πŸ“š GPT-Powered Research Assistant")
23
+
24
+ # --- Helper: APA Citation ---
25
+ def generate_apa_citation(title, url, source):
26
+ year = datetime.datetime.now().year
27
+ label = {
28
+ "arxiv": "*arXiv*", "semantic": "*Semantic Scholar*", "web": "*Web*"
29
+ }.get(source, "*Web*")
30
+ return f"{title}. ({year}). {label}. {url}"
31
+
32
+ # --- Search Tools ---
33
+ def tavily_search(query):
34
+ results = tavily.search(query, search_depth="advanced", max_results=5)
35
+ return results.get("results", [])
36
+
37
+ def arxiv_search(query):
 
 
 
 
 
 
 
 
 
38
  from urllib.parse import quote_plus
39
  url = f"http://export.arxiv.org/api/query?search_query=all:{quote_plus(query)}&start=0&max_results=3"
40
  feed = feedparser.parse(url)
 
44
  "url": next((l.href for l in e.links if l.type == "application/pdf"), "")
45
  } for e in feed.entries]
46
 
47
+ # --- Document Embedding ---
48
+ def embed_document(file):
49
+ doc_text = ""
50
+ if file.name.endswith(".pdf"):
51
+ reader = PdfReader(file)
52
+ for page in reader.pages:
53
+ text = page.extract_text()
54
+ if text:
55
+ doc_text += text
56
+ else:
57
+ doc_text = file.read().decode("utf-8")
58
+
59
+ chunks = [doc_text[i:i+1000] for i in range(0, len(doc_text), 1000)]
60
+ embeddings = openai.Embedding.create(input=chunks, model="text-embedding-ada-002")
61
+ vectors = [np.array(rec["embedding"], dtype=np.float32) for rec in embeddings["data"]]
62
+
63
+ dim = len(vectors[0])
64
+ index = faiss.IndexFlatL2(dim)
65
+ index.add(np.vstack(vectors))
66
+
67
+ return chunks, index
68
+
69
+ # --- Streaming GPT Call ---
70
+ def stream_response(messages):
71
+ response = openai.ChatCompletion.create(
72
+ model="gpt-4",
73
+ messages=messages,
74
+ max_tokens=3000,
75
+ stream=True
76
+ )
77
+ collected = ""
78
+ placeholder = st.empty()
79
+ for chunk in response:
80
+ delta = chunk["choices"][0].get("delta", {})
81
+ if "content" in delta:
82
+ token = delta["content"]
83
+ collected += token
84
+ placeholder.markdown(collected + "β–Œ")
85
+ placeholder.markdown(collected)
86
+ return collected
87
+
88
+ # --- Sidebar Input ---
89
+ with st.sidebar:
90
+ topic = st.text_input("πŸ” Research Topic", "AI in Sustainable Agriculture")
91
+ report_type = st.selectbox("πŸ“„ Report Type", ["Summary", "Detailed", "Academic Paper"])
92
+ tone = st.selectbox("🎯 Tone", ["Objective", "Scientific", "Persuasive"])
93
+ sources = st.selectbox("🌐 Sources", ["Web", "Documents", "Both"])
94
+ uploaded_file = st.file_uploader("πŸ“Ž Upload Document (PDF/TXT)", type=["pdf", "txt"])
95
+ start_button = st.button("πŸš€ Run Research")
96
+
97
+ # --- Main Agent Execution ---
98
+ if start_button and topic:
99
+ st.subheader("🧠 Agent Log")
100
+ with st.container():
101
+ st.markdown("<div style='max-height:300px; overflow-y:auto; background:#222; padding:10px; border-radius:10px;'>", unsafe_allow_html=True)
102
+ st.markdown("🧭 Starting research task...")
103
+ st.markdown(f"πŸ”Ž Topic: **{topic}** | Tone: _{tone}_ | Type: _{report_type}_")
104
+ st.markdown("</div>", unsafe_allow_html=True)
105
+
106
+ citations = []
107
+ context = ""
108
+
109
+ if sources in ["Web", "Both"]:
110
+ st.info("🌐 Searching web sources via Tavily...")
111
+ web_results = tavily_search(topic)
112
+ for r in web_results:
113
+ context += f"{r.get('content','')}
114
+ "
115
+ citations.append(generate_apa_citation(r.get("title", "Untitled"), r.get("url", "#"), "web"))
116
+
117
+ if sources in ["Documents", "Both"] and uploaded_file:
118
+ st.info("πŸ“„ Embedding and retrieving from uploaded document...")
119
+ chunks, index = embed_document(uploaded_file)
120
+ q_embed = openai.Embedding.create(input=[topic], model="text-embedding-ada-002")
121
+ q_vector = np.array(q_embed["data"][0]["embedding"], dtype=np.float32).reshape(1, -1)
122
+ D, I = index.search(q_vector, k=3)
123
+ for idx in I[0]:
124
+ context += chunks[idx] + "
125
+ "
126
+ citations.append(generate_apa_citation(uploaded_file.name, "Uploaded", "local"))
127
+
128
+ st.info("✍️ Generating final research report...")
129
+ messages = [
130
+ {"role": "system", "content": f"You are a research assistant. Write a {report_type.lower()} in a {tone.lower()} tone, citing sources."},
131
+ {"role": "user", "content": f"Topic: {topic}
132
+
133
+ Context:
134
+ {context}
135
+
136
+ Write a complete report in academic markdown format."}
137
+ ]
138
+
139
+ final_output = stream_response(messages)
140
+
141
+ # --- Show Output and Citations ---
142
+ st.subheader("πŸ“„ Final Report")
143
+ st.markdown(final_output, unsafe_allow_html=True)
144
+
145
+ st.subheader("πŸ“š References")
146
+ for cite in citations:
147
+ st.markdown(f"- {cite}")
148
+
149
+ st.download_button("πŸ’Ύ Download Markdown", final_output, file_name="report.md", mime="text/markdown")