Update app.py
app.py
CHANGED
@@ -3,32 +3,22 @@ import os
 import streamlit as st
 import requests
 import datetime
-import time
-import feedparser
 from dotenv import load_dotenv
 from duckduckgo_search import DDGS
+import feedparser
+import time
 from fuzzywuzzy import fuzz
-from langchain.embeddings.openai import OpenAIEmbeddings
-from langchain.vectorstores import FAISS
-from langchain.document_loaders import TextLoader, PyPDFLoader
-from langchain.text_splitter import RecursiveCharacterTextSplitter
-from langchain.chains import RetrievalQA
-from langchain.chat_models import ChatOpenAI
 
 load_dotenv()
 OPENROUTER_API_KEY = os.getenv("OPENROUTER_API_KEY")
-OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
-
-st.set_page_config("Advanced RAG Research Agent", layout="wide")
-st.title("🧠 Advanced Deep Research Agent (RAG + Documents + Real-time)")
 
-# ---
+# --- Helper Functions ---
 def call_llm(messages, model="deepseek/deepseek-chat-v3-0324:free", max_tokens=2048, temperature=0.7):
     url = "https://openrouter.ai/api/v1/chat/completions"
     headers = {
-        "Authorization": f"Bearer
+        "Authorization": f"Bearer " + OPENROUTER_API_KEY,
         "Content-Type": "application/json",
-        "X-Title": "
+        "X-Title": "GPT Deep Research Agent"
     }
     data = {
         "model": model,
@@ -43,17 +33,12 @@ def call_llm(messages, model="deepseek/deepseek-chat-v3-0324:free", max_tokens=2
     return result["choices"][0]["message"]["content"]
 
 def get_sources(topic, domains):
-    query = topic
-    if domains:
-        domain_list = [d.strip() for d in domains.split(",") if d.strip()]
-        if domain_list:
-            query = " OR ".join([f"site:{d} {topic}" for d in domain_list])
     with DDGS() as ddgs:
         return [{
             "title": r.get("title", "Untitled"),
             "snippet": r.get("body", ""),
             "url": r.get("href", "")
-        } for r in ddgs.text(
+        } for r in ddgs.text(topic + " site:" + domains if domains else topic, max_results=5)]
 
 def get_arxiv_papers(query):
     from urllib.parse import quote_plus
@@ -65,6 +50,17 @@ def get_arxiv_papers(query):
         "url": next((l.href for l in e.links if l.type == "application/pdf"), "")
     } for e in feed.entries]
 
+def get_semantic_papers(query):
+    url = "https://api.semanticscholar.org/graph/v1/paper/search"
+    params = {"query": query, "limit": 3, "fields": "title,abstract,url"}
+    response = requests.get(url, params=params)
+    papers = response.json().get("data", [])
+    return [{
+        "title": p.get("title"),
+        "summary": p.get("abstract", "No abstract available"),
+        "url": p.get("url")
+    } for p in papers]
+
 def check_plagiarism(text, topic):
     hits = []
     for r in get_sources(topic, ""):
@@ -76,89 +72,90 @@ def check_plagiarism(text, topic):
 def generate_apa_citation(title, url, source):
     year = datetime.datetime.now().year
     label = {
-        "arxiv": "*arXiv*", "semantic": "*Semantic Scholar*", "web": "*Web Source*"
+        "arxiv": "*arXiv*", "semantic": "*Semantic Scholar*", "web": "*Web Source*"
     }.get(source, "*Web*")
     return f"{title}. ({year}). {label}. {url}"
 
-# ---
-
-
-
-
-
-
-        loader = PyPDFLoader(f"/tmp/{file.name}") if ext == "pdf" else TextLoader(f"/tmp/{file.name}")
-        docs.extend(loader.load())
-    splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
-    split_docs = splitter.split_documents(docs)
-    vectorstore = FAISS.from_documents(split_docs, OpenAIEmbeddings(openai_api_key=OPENAI_API_KEY))
-    return RetrievalQA.from_chain_type(llm=ChatOpenAI(temperature=0, openai_api_key=OPENAI_API_KEY), retriever=vectorstore.as_retriever())
-
-# --- UI Layout ---
-topic = st.text_input("💡 Research Topic")
+# --- Streamlit UI ---
+st.set_page_config("Deep Research Bot", layout="wide")
+st.title("🤖 Real-time Deep Research Agent")
+
+st.markdown("This powerful assistant autonomously gathers, analyzes, and synthesizes research from multiple sources in real-time.")
+
+topic = st.text_input("💡 What would you like me to research next?")
 report_type = st.selectbox("π Type of report", [
-    "Summary - Short and fast
+    "Summary - Short and fast (~2 min)",
+    "Detailed Report (~5 min)",
+    "Thorough Academic Research (~10 min)"
 ])
-tone = st.selectbox("🎯 Tone", [
-    "Objective
+tone = st.selectbox("🎯 Tone of the report", [
+    "Objective - Impartial and unbiased presentation of facts and findings",
+    "Persuasive - Advocating a specific point of view",
+    "Narrative - Storytelling tone for layperson readers"
 ])
-source_type = st.selectbox("π Sources
-
-
+source_type = st.selectbox("π Sources to include", [
+    "Web Only", "Academic Only", "Hybrid"
+])
+custom_domains = st.text_input("π Query Domains (Optional)", placeholder="techcrunch.com, forbes.com")
 
-if st.button("
+if st.button("Research"):
     try:
-
-
-
-
-
-
-
-
-
-
-
+        with st.status("Starting agent tasks..."):
+            st.info("🧠 Thinking through research questions...")
+            time.sleep(1)
+            st.info("π Fetching data from selected sources...")
+
+        all_data, citations = "", []
+        if source_type in ["Web Only", "Hybrid"]:
+            web = get_sources(topic, custom_domains)
+            for w in web:
+                all_data += f"- [{w['title']}]({w['url']})\n> {w['snippet']}\n\n"
+                citations.append(generate_apa_citation(w["title"], w["url"], "web"))
+        if source_type in ["Academic Only", "Hybrid"]:
+            arxiv = get_arxiv_papers(topic)
+            for p in arxiv:
+                all_data += f"- [{p['title']}]({p['url']})\n> {p['summary'][:300]}...\n\n"
                 citations.append(generate_apa_citation(p["title"], p["url"], "arxiv"))
+            scholar = get_semantic_papers(topic)
+            for s in scholar:
+                all_data += f"- [{s['title']}]({s['url']})\n> {s['summary'][:300]}...\n\n"
+                citations.append(generate_apa_citation(s["title"], s["url"], "semantic"))
 
-
-        with st.spinner("π Processing documents with RAG..."):
-            rag_chain = process_uploaded_docs(uploaded_files)
-            rag_output = rag_chain.run(f"Summarize everything about: {topic}")
-            collected += f"\n**Document Insights:**\n\n{rag_output}\n"
-            citations.append(generate_apa_citation("Uploaded Materials", "local", "local"))
+        st.success("Data collection complete!")
 
-        with st.spinner("π Writing final report..."):
+        with st.spinner("π Writing final research report..."):
             prompt = f"""
-
+            # Research Task: {topic}
+
             Tone: {tone}
             Report Type: {report_type}
 
             Sources:
-            {
+            {all_data}
 
-            Now
-            1. Research gap
-            2.
-            3.
-            4.
-            """
-
+            Now, synthesize:
+            1. Research questions and gap
+            2. A novel insight or direction
+            3. A real-world application scenario
+            4. A {report_type.lower()} in academic markdown (no headings)
+            """
+            output = call_llm([{"role": "user", "content": prompt}], max_tokens=3500)
 
         st.subheader("π Research Report")
-        st.markdown(
+        st.markdown(output, unsafe_allow_html=True)
+
         st.markdown("### π APA Citations")
         for c in citations:
             st.markdown(f"- {c}")
 
-        with st.spinner("
-            overlaps = check_plagiarism(
+        with st.spinner("🧪 Checking for overlaps..."):
+            overlaps = check_plagiarism(output, topic)
         if overlaps:
-            st.warning("⚠️
+            st.warning("⚠️ Potential content overlap found.")
             for h in overlaps:
-                st.markdown(f"**{h['title']}**
+                st.markdown(f"**{h['title']}** - [{h['url']}]({h['url']})")
         else:
-            st.success("✅ No
+            st.success("✅ No major overlaps detected.")
 
     except Exception as e:
-        st.error(f"
+        st.error(f"Error: {e}")
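
For reference, the `get_semantic_papers` helper added in this commit can be exercised on its own, outside the Streamlit app. A minimal standalone sketch: the function body mirrors the diff above, while the `__main__` query and the printed fields are illustrative assumptions, not part of the commit.

import requests

def get_semantic_papers(query):
    # Same endpoint and fields as in app.py: Semantic Scholar Graph API paper search.
    url = "https://api.semanticscholar.org/graph/v1/paper/search"
    params = {"query": query, "limit": 3, "fields": "title,abstract,url"}
    response = requests.get(url, params=params)
    papers = response.json().get("data", [])
    return [{
        "title": p.get("title"),
        "summary": p.get("abstract", "No abstract available"),
        "url": p.get("url"),
    } for p in papers]

if __name__ == "__main__":
    # Illustrative query only.
    for paper in get_semantic_papers("retrieval augmented generation"):
        print(paper["title"], "-", paper["url"])

The title/summary/url dictionaries returned here are the same shape app.py appends to all_data and passes to generate_apa_citation.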