Ani14 committed
Commit 88c4eee · verified · 1 Parent(s): 7495b00

Update app.py

Files changed (1)
  1. app.py +74 -76
app.py CHANGED
@@ -3,22 +3,32 @@ import os
import streamlit as st
import requests
import datetime
+ import time
+ import feedparser
from dotenv import load_dotenv
from duckduckgo_search import DDGS
- import feedparser
- import time
from fuzzywuzzy import fuzz
+ from langchain.embeddings.openai import OpenAIEmbeddings
+ from langchain.vectorstores import FAISS
+ from langchain.document_loaders import TextLoader, PyPDFLoader
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
+ from langchain.chains import RetrievalQA
+ from langchain.chat_models import ChatOpenAI

load_dotenv()
OPENROUTER_API_KEY = os.getenv("OPENROUTER_API_KEY")
+ OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
+
+ st.set_page_config("Advanced RAG Research Agent", layout="wide")
+ st.title("🧠 Advanced Deep Research Agent (RAG + Documents + Real-time)")

- # --- Helper Functions ---
+ # --- Core Utilities ---
def call_llm(messages, model="deepseek/deepseek-chat-v3-0324:free", max_tokens=2048, temperature=0.7):
    url = "https://openrouter.ai/api/v1/chat/completions"
    headers = {
-         "Authorization": f"Bearer " + OPENROUTER_API_KEY,
+         "Authorization": f"Bearer {OPENROUTER_API_KEY}",
        "Content-Type": "application/json",
-         "X-Title": "GPT Deep Research Agent"
+         "X-Title": "RAG Deep Research Agent"
    }
    data = {
        "model": model,
@@ -34,7 +44,7 @@ def call_llm(messages, model="deepseek/deepseek-chat-v3-0324:free", max_tokens=2

def get_sources(topic, domains):
    query = topic
-     if domains and isinstance(domains, str):
+     if domains:
        domain_list = [d.strip() for d in domains.split(",") if d.strip()]
        if domain_list:
            query = " OR ".join([f"site:{d} {topic}" for d in domain_list])
@@ -55,17 +65,6 @@ def get_arxiv_papers(query):
        "url": next((l.href for l in e.links if l.type == "application/pdf"), "")
    } for e in feed.entries]

- def get_semantic_papers(query):
-     url = "https://api.semanticscholar.org/graph/v1/paper/search"
-     params = {"query": query, "limit": 3, "fields": "title,abstract,url"}
-     response = requests.get(url, params=params)
-     papers = response.json().get("data", [])
-     return [{
-         "title": p.get("title"),
-         "summary": p.get("abstract", "No abstract available"),
-         "url": p.get("url")
-     } for p in papers]
-
def check_plagiarism(text, topic):
    hits = []
    for r in get_sources(topic, ""):
@@ -77,90 +76,89 @@ def check_plagiarism(text, topic):
def generate_apa_citation(title, url, source):
    year = datetime.datetime.now().year
    label = {
-         "arxiv": "*arXiv*", "semantic": "*Semantic Scholar*", "web": "*Web Source*"
+         "arxiv": "*arXiv*", "semantic": "*Semantic Scholar*", "web": "*Web Source*", "local": "*Uploaded Document*"
    }.get(source, "*Web*")
    return f"{title}. ({year}). {label}. {url}"

- # --- Streamlit UI ---
- st.set_page_config("Deep Research Bot", layout="wide")
- st.title("🤖 Real-time Deep Research Agent")
-
- st.markdown("This powerful assistant autonomously gathers, analyzes, and synthesizes research from multiple sources in real-time.")
-
- topic = st.text_input("💡 What would you like me to research next?")
+ # --- RAG Processing from Uploaded Documents ---
+ def process_uploaded_docs(uploaded_files):
+     docs = []
+     for file in uploaded_files:
+         ext = file.name.split(".")[-1].lower()
+         with open(f"/tmp/{file.name}", "wb") as f:
+             f.write(file.read())
+         loader = PyPDFLoader(f"/tmp/{file.name}") if ext == "pdf" else TextLoader(f"/tmp/{file.name}")
+         docs.extend(loader.load())
+     splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
+     split_docs = splitter.split_documents(docs)
+     vectorstore = FAISS.from_documents(split_docs, OpenAIEmbeddings(openai_api_key=OPENAI_API_KEY))
+     return RetrievalQA.from_chain_type(llm=ChatOpenAI(temperature=0, openai_api_key=OPENAI_API_KEY), retriever=vectorstore.as_retriever())
+
+ # --- UI Layout ---
+ topic = st.text_input("💡 Research Topic")
report_type = st.selectbox("📄 Type of report", [
-     "Summary - Short and fast (~2 min)",
-     "Detailed Report (~5 min)",
-     "Thorough Academic Research (~10 min)"
+     "Summary - Short and fast", "Detailed Report", "Thorough Academic Research"
])
- tone = st.selectbox("🎯 Tone of the report", [
-     "Objective - Impartial and unbiased presentation of facts and findings",
-     "Persuasive - Advocating a specific point of view",
-     "Narrative - Storytelling tone for layperson readers"
+ tone = st.selectbox("🎯 Tone", [
+     "Objective", "Persuasive", "Narrative"
])
- source_type = st.selectbox("🌐 Sources to include", [
-     "Web Only", "Academic Only", "Hybrid"
- ])
- custom_domains = st.text_input("🔍 Query Domains (Optional)", placeholder="techcrunch.com, forbes.com")
+ source_type = st.selectbox("🌐 Sources", ["Web", "Documents", "Hybrid"])
+ custom_domains = st.text_input("🔍 Query Domains (optional)", placeholder="example.com, site.org")
+ uploaded_files = st.file_uploader("📁 Upload PDFs or text documents", accept_multiple_files=True)

- if st.button("Research"):
+ if st.button("🔍 Start Research"):
    try:
-         with st.status("Starting agent tasks..."):
-             st.info("🧠 Thinking through research questions...")
-             time.sleep(1)
-             st.info("🌐 Fetching data from selected sources...")
-
-             all_data, citations = "", []
-             if source_type in ["Web Only", "Hybrid"]:
-                 web = get_sources(topic, custom_domains)
-                 for w in web:
-                     all_data += f"- [{w['title']}]({w['url']})\n> {w['snippet']}\n\n"
-                     citations.append(generate_apa_citation(w["title"], w["url"], "web"))
-             if source_type in ["Academic Only", "Hybrid"]:
-                 arxiv = get_arxiv_papers(topic)
-                 for p in arxiv:
-                     all_data += f"- [{p['title']}]({p['url']})\n> {p['summary'][:300]}...\n\n"
+         collected, citations = "", []
+         st.status("🧠 Agent initializing...")
+
+         if source_type in ["Web", "Hybrid"]:
+             with st.spinner("🌐 Collecting web and arXiv data..."):
+                 sources = get_sources(topic, custom_domains)
+                 for s in sources:
+                     collected += f"- [{s['title']}]({s['url']})\n> {s['snippet']}\n\n"
+                     citations.append(generate_apa_citation(s["title"], s["url"], "web"))
+                 for p in get_arxiv_papers(topic):
+                     collected += f"- [{p['title']}]({p['url']})\n> {p['summary'][:300]}...\n\n"
                    citations.append(generate_apa_citation(p["title"], p["url"], "arxiv"))
-                 scholar = get_semantic_papers(topic)
-                 for s in scholar:
-                     all_data += f"- [{s['title']}]({s['url']})\n> {s['summary'][:300]}...\n\n"
-                     citations.append(generate_apa_citation(s["title"], s["url"], "semantic"))

-             st.success("Data collection complete!")
+         if source_type in ["Documents", "Hybrid"] and uploaded_files:
+             with st.spinner("📁 Processing documents with RAG..."):
+                 rag_chain = process_uploaded_docs(uploaded_files)
+                 rag_output = rag_chain.run(f"Summarize everything about: {topic}")
+                 collected += f"\n**Document Insights:**\n\n{rag_output}\n"
+                 citations.append(generate_apa_citation("Uploaded Materials", "local", "local"))

-         with st.spinner("📝 Writing final research report..."):
+         with st.spinner("📝 Writing final report..."):
            prompt = f"""
-             # Research Task: {topic}
-
+             Topic: {topic}
            Tone: {tone}
            Report Type: {report_type}

            Sources:
-             {all_data}
+             {collected}

-             Now, synthesize:
-             1. Research questions and gap
-             2. A novel insight or direction
-             3. A real-world application scenario
-             4. A {report_type.lower()} in academic markdown (no headings)
-             """
-             output = call_llm([{"role": "user", "content": prompt}], max_tokens=3500)
+             Now generate:
+             1. Research gap
+             2. Novel direction
+             3. Real-world example
+             4. Full article in markdown format
+             """
+             response = call_llm([{"role": "user", "content": prompt}], max_tokens=3000)

        st.subheader("📄 Research Report")
-         st.markdown(output, unsafe_allow_html=True)
-
+         st.markdown(response, unsafe_allow_html=True)
        st.markdown("### 📚 APA Citations")
        for c in citations:
            st.markdown(f"- {c}")

-         with st.spinner("🧪 Checking for overlaps..."):
-             overlaps = check_plagiarism(output, topic)
+         with st.spinner("🔍 Checking for plagiarism..."):
+             overlaps = check_plagiarism(response, topic)
        if overlaps:
-             st.warning("⚠️ Potential content overlap found.")
+             st.warning("⚠️ Content overlap found")
            for h in overlaps:
-                 st.markdown(f"**{h['title']}** - [{h['url']}]({h['url']})")
+                 st.markdown(f"**{h['title']}** — [{h['url']}]({h['url']})")
        else:
-             st.success("✅ No major overlaps detected.")
+             st.success("✅ No overlap detected")

    except Exception as e:
-         st.error(f"Error: {e}")
+         st.error(f"🚨 Error: {e}")
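
For context on the new document path: the branch added in this commit is just load, split, embed, retrieve, using the same LangChain calls that appear in the diff (`TextLoader`/`PyPDFLoader`, `RecursiveCharacterTextSplitter`, `FAISS.from_documents`, `RetrievalQA.from_chain_type`). A minimal sketch of that flow outside Streamlit, assuming `OPENAI_API_KEY` is set in the environment and using a placeholder `notes.txt` file (both are illustrative and not part of the commit):

```python
import os
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import FAISS
from langchain.document_loaders import TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.chains import RetrievalQA
from langchain.chat_models import ChatOpenAI

OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")

# Load and chunk a local file ("notes.txt" is a placeholder path).
docs = TextLoader("notes.txt").load()
splits = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100).split_documents(docs)

# Embed the chunks into an in-memory FAISS index and wrap the retriever in a
# RetrievalQA chain, mirroring process_uploaded_docs() from the diff above.
vectorstore = FAISS.from_documents(splits, OpenAIEmbeddings(openai_api_key=OPENAI_API_KEY))
qa = RetrievalQA.from_chain_type(
    llm=ChatOpenAI(temperature=0, openai_api_key=OPENAI_API_KEY),
    retriever=vectorstore.as_retriever(),
)

print(qa.run("Summarize everything about: retrieval-augmented generation"))
```

In the app itself, `process_uploaded_docs` does the same thing, except it first writes each uploaded file to `/tmp` so the LangChain loaders can read it from disk.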