Update app.py

app.py CHANGED
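
This update replaces the Semantic Scholar lookup with a LangChain-based RAG pipeline over uploaded documents (FAISS index + OpenAI embeddings + RetrievalQA), adds a file uploader and new source modes (Web / Documents / Hybrid), sets a page title and config, and fixes two small bugs: the OpenRouter Authorization header now interpolates the API key, and get_sources() gains the colon its if statement was missing.
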
@@ -3,22 +3,32 @@ import os
 import streamlit as st
 import requests
 import datetime
+import time
+import feedparser
 from dotenv import load_dotenv
 from duckduckgo_search import DDGS
-import feedparser
-import time
 from fuzzywuzzy import fuzz
+from langchain.embeddings.openai import OpenAIEmbeddings
+from langchain.vectorstores import FAISS
+from langchain.document_loaders import TextLoader, PyPDFLoader
+from langchain.text_splitter import RecursiveCharacterTextSplitter
+from langchain.chains import RetrievalQA
+from langchain.chat_models import ChatOpenAI

 load_dotenv()
 OPENROUTER_API_KEY = os.getenv("OPENROUTER_API_KEY")
+OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
+
+st.set_page_config("Advanced RAG Research Agent", layout="wide")
+st.title("🧠 Advanced Deep Research Agent (RAG + Documents + Real-time)")

-# ---
+# --- Core Utilities ---
 def call_llm(messages, model="deepseek/deepseek-chat-v3-0324:free", max_tokens=2048, temperature=0.7):
     url = "https://openrouter.ai/api/v1/chat/completions"
     headers = {
-        "Authorization": f"Bearer "
+        "Authorization": f"Bearer {OPENROUTER_API_KEY}",
         "Content-Type": "application/json",
-        "X-Title": "
+        "X-Title": "RAG Deep Research Agent"
     }
     data = {
         "model": model,
@@ -34,7 +44,7 @@ def call_llm(messages, model="deepseek/deepseek-chat-v3-0324:free", max_tokens=2

 def get_sources(topic, domains):
     query = topic
-    if domains
+    if domains:
         domain_list = [d.strip() for d in domains.split(",") if d.strip()]
         if domain_list:
             query = " OR ".join([f"site:{d} {topic}" for d in domain_list])
@@ -55,17 +65,6 @@ def get_arxiv_papers(query):
         "url": next((l.href for l in e.links if l.type == "application/pdf"), "")
     } for e in feed.entries]

-def get_semantic_papers(query):
-    url = "https://api.semanticscholar.org/graph/v1/paper/search"
-    params = {"query": query, "limit": 3, "fields": "title,abstract,url"}
-    response = requests.get(url, params=params)
-    papers = response.json().get("data", [])
-    return [{
-        "title": p.get("title"),
-        "summary": p.get("abstract", "No abstract available"),
-        "url": p.get("url")
-    } for p in papers]
-
 def check_plagiarism(text, topic):
     hits = []
     for r in get_sources(topic, ""):
@@ -77,90 +76,89 @@ def check_plagiarism(text, topic):
 def generate_apa_citation(title, url, source):
     year = datetime.datetime.now().year
     label = {
-        "arxiv": "*arXiv*", "semantic": "*Semantic Scholar*", "web": "*Web Source*"
+        "arxiv": "*arXiv*", "semantic": "*Semantic Scholar*", "web": "*Web Source*", "local": "*Uploaded Document*"
     }.get(source, "*Web*")
     return f"{title}. ({year}). {label}. {url}"

-# ---
-
-
-
-
-
-
+# --- RAG Processing from Uploaded Documents ---
+def process_uploaded_docs(uploaded_files):
+    docs = []
+    for file in uploaded_files:
+        ext = file.name.split(".")[-1].lower()
+        with open(f"/tmp/{file.name}", "wb") as f:
+            f.write(file.read())
+        loader = PyPDFLoader(f"/tmp/{file.name}") if ext == "pdf" else TextLoader(f"/tmp/{file.name}")
+        docs.extend(loader.load())
+    splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
+    split_docs = splitter.split_documents(docs)
+    vectorstore = FAISS.from_documents(split_docs, OpenAIEmbeddings(openai_api_key=OPENAI_API_KEY))
+    return RetrievalQA.from_chain_type(llm=ChatOpenAI(temperature=0, openai_api_key=OPENAI_API_KEY), retriever=vectorstore.as_retriever())
+
+# --- UI Layout ---
+topic = st.text_input("💡 Research Topic")
 report_type = st.selectbox("Type of report", [
-    "Summary - Short and fast
-    "Detailed Report (~5 min)",
-    "Thorough Academic Research (~10 min)"
+    "Summary - Short and fast", "Detailed Report", "Thorough Academic Research"
 ])
-tone = st.selectbox("🎯 Tone
-    "Objective
-    "Persuasive - Advocating a specific point of view",
-    "Narrative - Storytelling tone for layperson readers"
+tone = st.selectbox("🎯 Tone", [
+    "Objective", "Persuasive", "Narrative"
 ])
-source_type = st.selectbox("Sources
-
-
-custom_domains = st.text_input("Query Domains (Optional)", placeholder="techcrunch.com, forbes.com")
+source_type = st.selectbox("Sources", ["Web", "Documents", "Hybrid"])
+custom_domains = st.text_input("Query Domains (optional)", placeholder="example.com, site.org")
+uploaded_files = st.file_uploader("Upload PDFs or text documents", accept_multiple_files=True)

-if st.button("Research"):
+if st.button("Start Research"):
     try:
-
-
-
-
-
-
-
-
-
-
-
-            if source_type in ["Academic Only", "Hybrid"]:
-                arxiv = get_arxiv_papers(topic)
-                for p in arxiv:
-                    all_data += f"- [{p['title']}]({p['url']})\n> {p['summary'][:300]}...\n\n"
+        collected, citations = "", []
+        st.status("🧠 Agent initializing...")
+
+        if source_type in ["Web", "Hybrid"]:
+            with st.spinner("Collecting web and arXiv data..."):
+                sources = get_sources(topic, custom_domains)
+                for s in sources:
+                    collected += f"- [{s['title']}]({s['url']})\n> {s['snippet']}\n\n"
+                    citations.append(generate_apa_citation(s["title"], s["url"], "web"))
+                for p in get_arxiv_papers(topic):
+                    collected += f"- [{p['title']}]({p['url']})\n> {p['summary'][:300]}...\n\n"
                     citations.append(generate_apa_citation(p["title"], p["url"], "arxiv"))
-                scholar = get_semantic_papers(topic)
-                for s in scholar:
-                    all_data += f"- [{s['title']}]({s['url']})\n> {s['summary'][:300]}...\n\n"
-                    citations.append(generate_apa_citation(s["title"], s["url"], "semantic"))

-
+        if source_type in ["Documents", "Hybrid"] and uploaded_files:
+            with st.spinner("Processing documents with RAG..."):
+                rag_chain = process_uploaded_docs(uploaded_files)
+                rag_output = rag_chain.run(f"Summarize everything about: {topic}")
+                collected += f"\n**Document Insights:**\n\n{rag_output}\n"
+                citations.append(generate_apa_citation("Uploaded Materials", "local", "local"))

-        with st.spinner("Writing final
+        with st.spinner("Writing final report..."):
            prompt = f"""
-
-
+Topic: {topic}
 Tone: {tone}
 Report Type: {report_type}

 Sources:
-{
+{collected}

-Now
-1. Research
-2.
-3.
-4.
-
-
+Now generate:
+1. Research gap
+2. Novel direction
+3. Real-world example
+4. Full article in markdown format
+"""
+            response = call_llm([{"role": "user", "content": prompt}], max_tokens=3000)

         st.subheader("Research Report")
-        st.markdown(
-
+        st.markdown(response, unsafe_allow_html=True)
         st.markdown("### APA Citations")
         for c in citations:
             st.markdown(f"- {c}")

-        with st.spinner("
-            overlaps = check_plagiarism(
+        with st.spinner("Checking for plagiarism..."):
+            overlaps = check_plagiarism(response, topic)
         if overlaps:
-            st.warning("⚠️
+            st.warning("⚠️ Content overlap found")
             for h in overlaps:
-                st.markdown(f"**{h['title']}**
+                st.markdown(f"**{h['title']}** [{h['url']}]({h['url']})")
         else:
-            st.success("✅ No
+            st.success("✅ No overlap detected")

     except Exception as e:
-        st.error(f"Error: {e}")
+        st.error(f"🚨 Error: {e}")
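
The first hunk fixes call_llm()'s headers: the Authorization value now actually interpolates OPENROUTER_API_KEY, and X-Title is set to "RAG Deep Research Agent". The request body beyond "model" sits outside the hunk, but the same endpoint and headers can be smoke-tested in isolation. A minimal sketch, not part of app.py; it assumes OPENROUTER_API_KEY is exported and uses the app's default free DeepSeek model:

# openrouter_smoke_test.py (hypothetical helper, not in this commit)
import os
import requests

resp = requests.post(
    "https://openrouter.ai/api/v1/chat/completions",
    headers={
        "Authorization": f"Bearer {os.getenv('OPENROUTER_API_KEY')}",  # same header the fixed call_llm() sends
        "Content-Type": "application/json",
        "X-Title": "RAG Deep Research Agent",
    },
    json={
        "model": "deepseek/deepseek-chat-v3-0324:free",
        "messages": [{"role": "user", "content": "Reply with one short sentence."}],
        "max_tokens": 64,
        "temperature": 0.7,
    },
    timeout=60,
)
resp.raise_for_status()
print(resp.json()["choices"][0]["message"]["content"])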
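
The second hunk is a one-character fix: the old "if domains" was missing its colon, which is a SyntaxError. With the colon in place, get_sources() turns a comma-separated domain list into a site-restricted search query. The snippet below reuses that exact logic with placeholder inputs to show the resulting query string:

# query_builder_demo.py (illustrative; topic and domains are placeholders)
topic = "retrieval augmented generation"
domains = "example.com, site.org"  # same format as the "Query Domains (optional)" field

query = topic
if domains:
    domain_list = [d.strip() for d in domains.split(",") if d.strip()]
    if domain_list:
        query = " OR ".join([f"site:{d} {topic}" for d in domain_list])

print(query)
# site:example.com retrieval augmented generation OR site:site.org retrieval augmented generation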
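
The centerpiece of the last hunk is process_uploaded_docs(): each upload is written to /tmp, loaded with PyPDFLoader or TextLoader, split into 1000-character chunks with 100-character overlap, embedded into a FAISS index, and wrapped in a RetrievalQA chain that the app then queries with rag_chain.run(...). The same pipeline can be exercised outside Streamlit. A rough sketch, assuming a local ./sample.pdf and the legacy langchain.* import paths used above (newer LangChain releases relocate these modules):

# rag_pipeline_sketch.py (illustrative; mirrors process_uploaded_docs from the diff)
import os
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import FAISS
from langchain.chains import RetrievalQA
from langchain.chat_models import ChatOpenAI

docs = PyPDFLoader("./sample.pdf").load()              # one Document per PDF page
splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
chunks = splitter.split_documents(docs)                # same chunking parameters as the app
vectorstore = FAISS.from_documents(chunks, OpenAIEmbeddings(openai_api_key=os.getenv("OPENAI_API_KEY")))
qa = RetrievalQA.from_chain_type(
    llm=ChatOpenAI(temperature=0, openai_api_key=os.getenv("OPENAI_API_KEY")),
    retriever=vectorstore.as_retriever(),
)
print(qa.run("Summarize everything about: the uploaded document"))

Running this (and the Space itself) also needs faiss-cpu, pypdf and openai installed alongside langchain; the diff does not show a requirements file, so the exact pinned versions are unknown.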