Ani14 committed on
Commit 2b04ac1 · verified · 1 Parent(s): 65c3858

Update app.py

Files changed (1)
  1. app.py +125 -166
app.py CHANGED
@@ -1,22 +1,26 @@
 import os
 import streamlit as st
 import requests
-import feedparser
 import datetime
-from fuzzywuzzy import fuzz
 from dotenv import load_dotenv
-from duckduckgo_search import DDGS
+from tavily import TavilyClient
+import feedparser
+import time
+from fuzzywuzzy import fuzz

+# Load environment variables
 load_dotenv()
 OPENROUTER_API_KEY = os.getenv("OPENROUTER_API_KEY")
+TAVILY_API_KEY = os.getenv("TAVILY_API_KEY", "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")
+tavily = TavilyClient(api_key=TAVILY_API_KEY)

-# --- Call OpenRouter LLM ---
+# --- Helper Functions ---
 def call_llm(messages, model="deepseek/deepseek-chat-v3-0324:free", max_tokens=2048, temperature=0.7):
     url = "https://openrouter.ai/api/v1/chat/completions"
     headers = {
         "Authorization": f"Bearer {OPENROUTER_API_KEY}",
         "Content-Type": "application/json",
-        "X-Title": "Autonomous Research Assistant"
+        "X-Title": "GPT Deep Research Agent"
     }
     data = {
         "model": model,
@@ -24,183 +28,138 @@ def call_llm(messages, model="deepseek/deepseek-chat-v3-0324:free", max_tokens=2
         "max_tokens": max_tokens,
         "temperature": temperature
     }
-    try:
-        response = requests.post(url, headers=headers, json=data)
-        result = response.json()
-    except Exception as e:
-        raise RuntimeError(f"Failed to connect or parse response: {e}")
+    response = requests.post(url, headers=headers, json=data)
+    result = response.json()
     if response.status_code != 200:
         raise RuntimeError(result.get("error", {}).get("message", "LLM API error"))
-    if "choices" not in result:
-        raise RuntimeError(f"Invalid response: {result}")
     return result["choices"][0]["message"]["content"]

-# --- Plagiarism Check ---
-def check_plagiarism(text, query, threshold=70):
-    web_results = search_duckduckgo(query, max_results=5)
-    plagiarized_snippets = []
-    for result in web_results:
-        snippet = result.get("snippet", "")
-        similarity = fuzz.token_set_ratio(text, snippet)
-        if similarity >= threshold:
-            plagiarized_snippets.append({
-                "title": result["title"],
-                "url": result["url"],
-                "snippet": snippet,
-                "similarity": similarity
-            })
-    return plagiarized_snippets
-
-# --- Source Utilities ---
-def get_arxiv_papers(query, max_results=3):
+def get_sources(topic, domains):
+    query = topic
+    if domains and isinstance(domains, str):
+        domain_list = [d.strip() for d in domains.split(",") if d.strip()]
+        if domain_list:
+            query = " OR ".join([f"site:{d} {topic}" for d in domain_list])
+    results = tavily.search(query=query, search_depth="advanced", max_results=5)
+    return [{
+        "title": r.get("title", "Untitled"),
+        "snippet": r.get("content", ""),
+        "url": r.get("url", "")
+    } for r in results.get("results", [])]
+
+def get_arxiv_papers(query):
     from urllib.parse import quote_plus
-    url = f"http://export.arxiv.org/api/query?search_query=all:{quote_plus(query)}&start=0&max_results={max_results}"
+    url = f"http://export.arxiv.org/api/query?search_query=all:{quote_plus(query)}&start=0&max_results=3"
     feed = feedparser.parse(url)
     return [{
-        "title": e.title or "Untitled",
-        "summary": (e.summary or "No summary available").replace("\n", " ").strip(),
+        "title": e.title,
+        "summary": e.summary.replace("\n", " ").strip(),
         "url": next((l.href for l in e.links if l.type == "application/pdf"), "")
     } for e in feed.entries]

-def get_semantic_scholar_papers(query, max_results=3):
+def get_semantic_papers(query):
     url = "https://api.semanticscholar.org/graph/v1/paper/search"
-    params = {"query": query, "limit": max_results, "fields": "title,abstract,url"}
+    params = {"query": query, "limit": 3, "fields": "title,abstract,url"}
     response = requests.get(url, params=params)
     papers = response.json().get("data", [])
     return [{
-        "title": p.get("title") or "Untitled",
-        "summary": (p.get("abstract") or "No abstract available").strip(),
-        "url": p.get("url", "")
+        "title": p.get("title"),
+        "summary": p.get("abstract", "No abstract available"),
+        "url": p.get("url")
     } for p in papers]

-def search_duckduckgo(query, max_results=3):
-    with DDGS() as ddgs:
-        return [{
-            "title": r["title"] or "Untitled",
-            "snippet": r["body"] or "",
-            "url": r["href"] or ""
-        } for r in ddgs.text(query, max_results=max_results)]
-
-def get_image_urls(query, max_images=3):
-    with DDGS() as ddgs:
-        return [img["image"] for img in ddgs.images(query, max_results=max_images)]
-
-def generate_apa_citation(title, url, source=""):
-    current_year = datetime.datetime.now().year
-    if source == "arxiv":
-        return f"{title}. ({current_year}). *arXiv*. {url}"
-    elif source == "semantic":
-        return f"{title}. ({current_year}). *Semantic Scholar*. {url}"
-    elif source == "web":
-        return f"{title}. ({current_year}). *Web Source*. {url}"
-    else:
-        return f"{title}. ({current_year}). {url}"
-
-# --- Research Agent ---
-def autonomous_research_agent(topic):
-    arxiv = get_arxiv_papers(topic)
-    scholar = get_semantic_scholar_papers(topic)
-    web = search_duckduckgo(topic)
-    images = get_image_urls(topic)
-
-    arxiv_md, arxiv_citations = "", []
-    for p in arxiv:
-        arxiv_md += f"- [{p['title']}]({p['url']})\n> {p['summary'][:300]}...\n\n"
-        arxiv_citations.append(generate_apa_citation(p["title"], p["url"], source="arxiv"))
-
-    scholar_md, scholar_citations = "", []
-    for p in scholar:
-        scholar_md += f"- [{p['title']}]({p['url']})\n> {p['summary'][:300]}...\n\n"
-        scholar_citations.append(generate_apa_citation(p["title"], p["url"], source="semantic"))
-
-    web_md, web_citations = "", []
-    for w in web:
-        web_md += f"- [{w['title']}]({w['url']})\n> {w['snippet']}\n\n"
-        web_citations.append(generate_apa_citation(w["title"], w["url"], source="web"))
-
-    prompt = f"""
-# Research Topic: {topic}
-
-## ArXiv:
-{arxiv_md}
-
-## Semantic Scholar:
-{scholar_md}
-
-## Web Insights:
-{web_md}
-
-Now synthesize this information into:
-1. A research gap
-2. A novel research direction
-3. A full markdown-formatted research article (continuous, no section labels, academic tone)
-"""
-    response = call_llm([{"role": "user", "content": prompt}], max_tokens=3000)
-
-    # Append Sources
-    response += "\n\n---\n### Sources Cited\n"
-    if arxiv_md:
-        response += "**ArXiv:**\n" + arxiv_md
-    if scholar_md:
-        response += "**Semantic Scholar:**\n" + scholar_md
-    if web_md:
-        response += "**Web:**\n" + web_md
-
-    # APA Citations Section
-    all_citations = arxiv_citations + scholar_citations + web_citations
-    response += "\n---\n### 📚 APA Citations\n"
-    for cite in all_citations:
-        response += f"- {cite}\n"
-
-    return response, images
+def check_plagiarism(text, topic):
+    hits = []
+    for r in get_sources(topic, ""):
+        similarity = fuzz.token_set_ratio(text, r["snippet"])
+        if similarity >= 75:
+            hits.append(r)
+    return hits
+
+def generate_apa_citation(title, url, source):
+    year = datetime.datetime.now().year
+    label = {
+        "arxiv": "*arXiv*", "semantic": "*Semantic Scholar*", "web": "*Web Source*"
+    }.get(source, "*Web*")
+    return f"{title}. ({year}). {label}. {url}"

 # --- Streamlit UI ---
-st.set_page_config("Autonomous Research Assistant", layout="wide")
-st.title("🤖 Autonomous AI Research Assistant")
-
-if "chat_history" not in st.session_state:
-    st.session_state.chat_history = []
-
-topic = st.text_input("Enter a research topic:")
-if st.button("Run Research Agent"):
-    with st.spinner("Gathering sources & thinking..."):
-        try:
-            response, images = autonomous_research_agent(topic)
-
-            # Display images
-            if images:
-                st.subheader("🖼️ Relevant Images")
-                st.image(images, width=300)
-
-            # Display markdown response
-            st.session_state.chat_history.append({"role": "user", "content": topic})
-            st.session_state.chat_history.append({"role": "assistant", "content": response})
-            st.markdown(response)
-
-            # Check for plagiarism
-            plagiarism_hits = check_plagiarism(response, topic)
-            if plagiarism_hits:
-                st.warning("⚠️ Potential overlap with existing web content detected.")
-                st.subheader("🕵️ Plagiarism Check Results")
-                for hit in plagiarism_hits:
-                    st.markdown(f"**{hit['title']}** - [{hit['url']}]({hit['url']})")
-                    st.markdown(f"> _Similarity: {hit['similarity']}%_\n\n{hit['snippet']}")
+st.set_page_config("Deep Research Bot", layout="wide")
+st.title("🤖 Real-time Deep Research Agent (Tavily Edition)")
+
+st.markdown("This powerful assistant autonomously gathers, analyzes, and synthesizes research from multiple sources in real-time using Tavily.")
+
+topic = st.text_input("💡 What would you like me to research next?")
+report_type = st.selectbox("📄 Type of report", [
+    "Summary - Short and fast (~2 min)",
+    "Detailed Report (~5 min)",
+    "Thorough Academic Research (~10 min)"
+])
+tone = st.selectbox("🎯 Tone of the report", [
+    "Objective - Impartial and unbiased presentation of facts and findings",
+    "Persuasive - Advocating a specific point of view",
+    "Narrative - Storytelling tone for layperson readers"
+])
+source_type = st.selectbox("🌐 Sources to include", [
+    "Web Only", "Academic Only", "Hybrid"
+])
+custom_domains = st.text_input("🔍 Query Domains (Optional)", placeholder="techcrunch.com, forbes.com")
+
+if st.button("Research"):
+    try:
+        with st.status("Starting agent tasks..."):
+            st.info("🧠 Thinking through research questions...")
+            time.sleep(1)
+            st.info("🌐 Fetching data from selected sources...")
+
+            all_data, citations = "", []
+            if source_type in ["Web Only", "Hybrid"]:
+                web = get_sources(topic, custom_domains)
+                for w in web:
+                    all_data += f"- [{w['title']}]({w['url']})\n> {w['snippet']}\n\n"
+                    citations.append(generate_apa_citation(w["title"], w["url"], "web"))
+            if source_type in ["Academic Only", "Hybrid"]:
+                arxiv = get_arxiv_papers(topic)
+                for p in arxiv:
+                    all_data += f"- [{p['title']}]({p['url']})\n> {p['summary'][:300]}...\n\n"
+                    citations.append(generate_apa_citation(p["title"], p["url"], "arxiv"))
+                scholar = get_semantic_papers(topic)
+                for s in scholar:
+                    all_data += f"- [{s['title']}]({s['url']})\n> {s['summary'][:300]}...\n\n"
+                    citations.append(generate_apa_citation(s["title"], s["url"], "semantic"))
+
+            st.success("Data collection complete!")
+
+        with st.spinner("📝 Writing final research report..."):
+            prompt = f"""
+# Research Task: {topic}
+Tone: {tone}
+Report Type: {report_type}
+Sources:
+{all_data}
+Now, synthesize:
+1. Research questions and gap
+2. A novel insight or direction
+3. A real-world application scenario
+4. A {report_type.lower()} in academic markdown (no headings)
+"""
+            output = call_llm([{"role": "user", "content": prompt}], max_tokens=3500)
+
+        st.subheader("📄 Research Report")
+        st.markdown(output, unsafe_allow_html=True)
+
+        st.markdown("### 📚 APA Citations")
+        for c in citations:
+            st.markdown(f"- {c}")
+
+        with st.spinner("🧪 Checking for overlaps..."):
+            overlaps = check_plagiarism(output, topic)
+            if overlaps:
+                st.warning("⚠️ Potential content overlap found.")
+                for h in overlaps:
+                    st.markdown(f"**{h['title']}** - [{h['url']}]({h['url']})")
             else:
-                st.success("✅ No significant overlaps found. Content appears original.")
-        except Exception as e:
-            st.error(f"Failed: {e}")
-
-# --- Follow-up Chat ---
-st.divider()
-st.subheader("💬 Follow-up Q&A")
-followup = st.text_input("Ask a follow-up question:")
-if st.button("Ask"):
-    if followup:
-        try:
-            chat = st.session_state.chat_history + [{"role": "user", "content": followup}]
-            answer = call_llm(chat, max_tokens=1500)
-            st.session_state.chat_history.append({"role": "user", "content": followup})
-            st.session_state.chat_history.append({"role": "assistant", "content": answer})
-            st.markdown(answer)
-        except Exception as e:
-            st.error(f"Follow-up error: {e}")
+                st.success("✅ No major overlaps detected.")
+
+    except Exception as e:
+        st.error(f"Error: {e}")
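
For reviewers who want to exercise the new retrieval path without launching the Streamlit page, the sketch below mirrors the domain-restricted query construction and Tavily call that the updated get_sources() performs. It is a minimal standalone sketch, not part of the commit: the file name, the placeholder topic and domains, and the expectation that TAVILY_API_KEY is provided via the environment or a local .env are all illustrative assumptions.

# smoke_test_tavily.py: hypothetical standalone check, not part of this commit.
# Mirrors the "site:"-restricted query construction and Tavily call that
# get_sources() in the updated app.py performs.
import os

from dotenv import load_dotenv
from tavily import TavilyClient

load_dotenv()  # expects TAVILY_API_KEY in the environment or a local .env
client = TavilyClient(api_key=os.environ["TAVILY_API_KEY"])

topic = "autonomous research agents"    # placeholder topic
domains = "techcrunch.com, forbes.com"  # same comma-separated format as the optional UI field

# One "site:<domain> <topic>" clause per domain, OR-joined; fall back to the bare topic.
domain_list = [d.strip() for d in domains.split(",") if d.strip()]
query = " OR ".join(f"site:{d} {topic}" for d in domain_list) if domain_list else topic

results = client.search(query=query, search_depth="advanced", max_results=5)
for r in results.get("results", []):
    # The same fields app.py maps into its {"title", "snippet", "url"} records.
    print(r.get("title", "Untitled"), "->", r.get("url", ""))

Running it should print up to five title/URL pairs in the same shape the report-building loop in app.py consumes; leaving domains empty reproduces the unrestricted query that check_plagiarism() issues.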