Ani14 committed on
Commit 4083831 · verified · 1 Parent(s): b8ed084

Update app.py

Files changed (1): app.py (+76, -79)
app.py CHANGED
@@ -3,32 +3,22 @@ import os
 import streamlit as st
 import requests
 import datetime
-import time
-import feedparser
 from dotenv import load_dotenv
 from duckduckgo_search import DDGS
+import feedparser
+import time
 from fuzzywuzzy import fuzz
-from langchain.embeddings.openai import OpenAIEmbeddings
-from langchain.vectorstores import FAISS
-from langchain.document_loaders import TextLoader, PyPDFLoader
-from langchain.text_splitter import RecursiveCharacterTextSplitter
-from langchain.chains import RetrievalQA
-from langchain.chat_models import ChatOpenAI

 load_dotenv()
 OPENROUTER_API_KEY = os.getenv("OPENROUTER_API_KEY")
-OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
-
-st.set_page_config("Advanced RAG Research Agent", layout="wide")
-st.title("🧠 Advanced Deep Research Agent (RAG + Documents + Real-time)")

-# --- Core Utilities ---
+# --- Helper Functions ---
 def call_llm(messages, model="deepseek/deepseek-chat-v3-0324:free", max_tokens=2048, temperature=0.7):
     url = "https://openrouter.ai/api/v1/chat/completions"
     headers = {
-        "Authorization": f"Bearer {OPENROUTER_API_KEY}",
+        "Authorization": f"Bearer " + OPENROUTER_API_KEY,
         "Content-Type": "application/json",
-        "X-Title": "RAG Deep Research Agent"
+        "X-Title": "GPT Deep Research Agent"
     }
     data = {
         "model": model,
@@ -43,17 +33,12 @@ def call_llm(messages, model="deepseek/deepseek-chat-v3-0324:free", max_tokens=2
     return result["choices"][0]["message"]["content"]

 def get_sources(topic, domains):
-    query = topic
-    if domains:
-        domain_list = [d.strip() for d in domains.split(",") if d.strip()]
-        if domain_list:
-            query = " OR ".join([f"site:{d} {topic}" for d in domain_list])
     with DDGS() as ddgs:
         return [{
             "title": r.get("title", "Untitled"),
             "snippet": r.get("body", ""),
             "url": r.get("href", "")
-        } for r in ddgs.text(query, max_results=5)]
+        } for r in ddgs.text(topic + " site:" + domains if domains else topic, max_results=5)]

 def get_arxiv_papers(query):
     from urllib.parse import quote_plus
@@ -65,6 +50,17 @@ def get_arxiv_papers(query):
         "url": next((l.href for l in e.links if l.type == "application/pdf"), "")
     } for e in feed.entries]

+def get_semantic_papers(query):
+    url = "https://api.semanticscholar.org/graph/v1/paper/search"
+    params = {"query": query, "limit": 3, "fields": "title,abstract,url"}
+    response = requests.get(url, params=params)
+    papers = response.json().get("data", [])
+    return [{
+        "title": p.get("title"),
+        "summary": p.get("abstract", "No abstract available"),
+        "url": p.get("url")
+    } for p in papers]
+
 def check_plagiarism(text, topic):
     hits = []
     for r in get_sources(topic, ""):
@@ -76,89 +72,90 @@ def check_plagiarism(text, topic):
 def generate_apa_citation(title, url, source):
     year = datetime.datetime.now().year
     label = {
-        "arxiv": "*arXiv*", "semantic": "*Semantic Scholar*", "web": "*Web Source*", "local": "*Uploaded Document*"
+        "arxiv": "*arXiv*", "semantic": "*Semantic Scholar*", "web": "*Web Source*"
     }.get(source, "*Web*")
     return f"{title}. ({year}). {label}. {url}"

-# --- RAG Processing from Uploaded Documents ---
-def process_uploaded_docs(uploaded_files):
-    docs = []
-    for file in uploaded_files:
-        ext = file.name.split(".")[-1].lower()
-        with open(f"/tmp/{file.name}", "wb") as f:
-            f.write(file.read())
-        loader = PyPDFLoader(f"/tmp/{file.name}") if ext == "pdf" else TextLoader(f"/tmp/{file.name}")
-        docs.extend(loader.load())
-    splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
-    split_docs = splitter.split_documents(docs)
-    vectorstore = FAISS.from_documents(split_docs, OpenAIEmbeddings(openai_api_key=OPENAI_API_KEY))
-    return RetrievalQA.from_chain_type(llm=ChatOpenAI(temperature=0, openai_api_key=OPENAI_API_KEY), retriever=vectorstore.as_retriever())
-
-# --- UI Layout ---
-topic = st.text_input("💡 Research Topic")
+# --- Streamlit UI ---
+st.set_page_config("Deep Research Bot", layout="wide")
+st.title("🤖 Real-time Deep Research Agent")
+
+st.markdown("This powerful assistant autonomously gathers, analyzes, and synthesizes research from multiple sources in real-time.")
+
+topic = st.text_input("💡 What would you like me to research next?")
 report_type = st.selectbox("📄 Type of report", [
-    "Summary - Short and fast", "Detailed Report", "Thorough Academic Research"
+    "Summary - Short and fast (~2 min)",
+    "Detailed Report (~5 min)",
+    "Thorough Academic Research (~10 min)"
 ])
-tone = st.selectbox("🎯 Tone", [
-    "Objective", "Persuasive", "Narrative"
+tone = st.selectbox("🎯 Tone of the report", [
+    "Objective - Impartial and unbiased presentation of facts and findings",
+    "Persuasive - Advocating a specific point of view",
+    "Narrative - Storytelling tone for layperson readers"
 ])
-source_type = st.selectbox("🌐 Sources", ["Web", "Documents", "Hybrid"])
-custom_domains = st.text_input("🔍 Query Domains (optional)", placeholder="example.com, site.org")
-uploaded_files = st.file_uploader("📁 Upload PDFs or text documents", accept_multiple_files=True)
+source_type = st.selectbox("🌐 Sources to include", [
+    "Web Only", "Academic Only", "Hybrid"
+])
+custom_domains = st.text_input("🔍 Query Domains (Optional)", placeholder="techcrunch.com, forbes.com")

-if st.button("🔍 Start Research"):
+if st.button("Research"):
     try:
-        collected, citations = "", []
-        st.status("🧠 Agent initializing...")
-
-        if source_type in ["Web", "Hybrid"]:
-            with st.spinner("🌐 Collecting web and arXiv data..."):
-                sources = get_sources(topic, custom_domains)
-            for s in sources:
-                collected += f"- [{s['title']}]({s['url']})\n> {s['snippet']}\n\n"
-                citations.append(generate_apa_citation(s["title"], s["url"], "web"))
-            for p in get_arxiv_papers(topic):
-                collected += f"- [{p['title']}]({p['url']})\n> {p['summary'][:300]}...\n\n"
+        with st.status("Starting agent tasks..."):
+            st.info("🧠 Thinking through research questions...")
+            time.sleep(1)
+            st.info("🌐 Fetching data from selected sources...")
+
+        all_data, citations = "", []
+        if source_type in ["Web Only", "Hybrid"]:
+            web = get_sources(topic, custom_domains)
+            for w in web:
+                all_data += f"- [{w['title']}]({w['url']})\n> {w['snippet']}\n\n"
+                citations.append(generate_apa_citation(w["title"], w["url"], "web"))
+        if source_type in ["Academic Only", "Hybrid"]:
+            arxiv = get_arxiv_papers(topic)
+            for p in arxiv:
+                all_data += f"- [{p['title']}]({p['url']})\n> {p['summary'][:300]}...\n\n"
                 citations.append(generate_apa_citation(p["title"], p["url"], "arxiv"))
+            scholar = get_semantic_papers(topic)
+            for s in scholar:
+                all_data += f"- [{s['title']}]({s['url']})\n> {s['summary'][:300]}...\n\n"
+                citations.append(generate_apa_citation(s["title"], s["url"], "semantic"))

-        if source_type in ["Documents", "Hybrid"] and uploaded_files:
-            with st.spinner("📁 Processing documents with RAG..."):
-                rag_chain = process_uploaded_docs(uploaded_files)
-                rag_output = rag_chain.run(f"Summarize everything about: {topic}")
-                collected += f"\n**Document Insights:**\n\n{rag_output}\n"
-                citations.append(generate_apa_citation("Uploaded Materials", "local", "local"))
+        st.success("Data collection complete!")

-        with st.spinner("📝 Writing final report..."):
+        with st.spinner("📝 Writing final research report..."):
             prompt = f"""
-        Topic: {topic}
+        # Research Task: {topic}
+
         Tone: {tone}
         Report Type: {report_type}

         Sources:
-        {collected}
+        {all_data}

-        Now generate:
-        1. Research gap
-        2. Novel direction
-        3. Real-world example
-        4. Full article in markdown format
-        """
-            response = call_llm([{"role": "user", "content": prompt}], max_tokens=3000)
+        Now, synthesize:
+        1. Research questions and gap
+        2. A novel insight or direction
+        3. A real-world application scenario
+        4. A {report_type.lower()} in academic markdown (no headings)
+        """
+            output = call_llm([{"role": "user", "content": prompt}], max_tokens=3500)

         st.subheader("📄 Research Report")
-        st.markdown(response, unsafe_allow_html=True)
+        st.markdown(output, unsafe_allow_html=True)
+
         st.markdown("### 📚 APA Citations")
         for c in citations:
             st.markdown(f"- {c}")

-        with st.spinner("🔍 Checking for plagiarism..."):
-            overlaps = check_plagiarism(response, topic)
+        with st.spinner("🧪 Checking for overlaps..."):
+            overlaps = check_plagiarism(output, topic)
         if overlaps:
-            st.warning("⚠️ Content overlap found")
+            st.warning("⚠️ Potential content overlap found.")
             for h in overlaps:
-                st.markdown(f"**{h['title']}** — [{h['url']}]({h['url']})")
+                st.markdown(f"**{h['title']}** - [{h['url']}]({h['url']})")
         else:
-            st.success("✅ No overlap detected")
+            st.success("✅ No major overlaps detected.")

     except Exception as e:
-        st.error(f"🚨 Error: {e}")
+        st.error(f"Error: {e}")