Ani14 commited on
Commit
7d54951
·
verified ·
1 Parent(s): 4083831

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +132 -144
app.py CHANGED
@@ -3,44 +3,38 @@ import os
3
  import streamlit as st
4
  import requests
5
  import datetime
6
- from dotenv import load_dotenv
7
- from duckduckgo_search import DDGS
8
  import feedparser
9
- import time
10
- from fuzzywuzzy import fuzz
 
 
 
11
 
 
12
  load_dotenv()
13
- OPENROUTER_API_KEY = os.getenv("OPENROUTER_API_KEY")
14
-
15
- # --- Helper Functions ---
16
- def call_llm(messages, model="deepseek/deepseek-chat-v3-0324:free", max_tokens=2048, temperature=0.7):
17
- url = "https://openrouter.ai/api/v1/chat/completions"
18
- headers = {
19
- "Authorization": f"Bearer " + OPENROUTER_API_KEY,
20
- "Content-Type": "application/json",
21
- "X-Title": "GPT Deep Research Agent"
22
- }
23
- data = {
24
- "model": model,
25
- "messages": messages,
26
- "max_tokens": max_tokens,
27
- "temperature": temperature
28
- }
29
- response = requests.post(url, headers=headers, json=data)
30
- result = response.json()
31
- if response.status_code != 200:
32
- raise RuntimeError(result.get("error", {}).get("message", "LLM API error"))
33
- return result["choices"][0]["message"]["content"]
34
-
35
- def get_sources(topic, domains):
36
- with DDGS() as ddgs:
37
- return [{
38
- "title": r.get("title", "Untitled"),
39
- "snippet": r.get("body", ""),
40
- "url": r.get("href", "")
41
- } for r in ddgs.text(topic + " site:" + domains if domains else topic, max_results=5)]
42
-
43
- def get_arxiv_papers(query):
44
  from urllib.parse import quote_plus
45
  url = f"http://export.arxiv.org/api/query?search_query=all:{quote_plus(query)}&start=0&max_results=3"
46
  feed = feedparser.parse(url)
@@ -50,112 +44,106 @@ def get_arxiv_papers(query):
50
  "url": next((l.href for l in e.links if l.type == "application/pdf"), "")
51
  } for e in feed.entries]
52
 
53
- def get_semantic_papers(query):
54
- url = "https://api.semanticscholar.org/graph/v1/paper/search"
55
- params = {"query": query, "limit": 3, "fields": "title,abstract,url"}
56
- response = requests.get(url, params=params)
57
- papers = response.json().get("data", [])
58
- return [{
59
- "title": p.get("title"),
60
- "summary": p.get("abstract", "No abstract available"),
61
- "url": p.get("url")
62
- } for p in papers]
63
-
64
- def check_plagiarism(text, topic):
65
- hits = []
66
- for r in get_sources(topic, ""):
67
- similarity = fuzz.token_set_ratio(text, r["snippet"])
68
- if similarity >= 75:
69
- hits.append(r)
70
- return hits
71
-
72
- def generate_apa_citation(title, url, source):
73
- year = datetime.datetime.now().year
74
- label = {
75
- "arxiv": "*arXiv*", "semantic": "*Semantic Scholar*", "web": "*Web Source*"
76
- }.get(source, "*Web*")
77
- return f"{title}. ({year}). {label}. {url}"
78
-
79
- # --- Streamlit UI ---
80
- st.set_page_config("Deep Research Bot", layout="wide")
81
- st.title("πŸ€– Real-time Deep Research Agent")
82
-
83
- st.markdown("This powerful assistant autonomously gathers, analyzes, and synthesizes research from multiple sources in real-time.")
84
-
85
- topic = st.text_input("πŸ’‘ What would you like me to research next?")
86
- report_type = st.selectbox("πŸ“„ Type of report", [
87
- "Summary - Short and fast (~2 min)",
88
- "Detailed Report (~5 min)",
89
- "Thorough Academic Research (~10 min)"
90
- ])
91
- tone = st.selectbox("🎯 Tone of the report", [
92
- "Objective - Impartial and unbiased presentation of facts and findings",
93
- "Persuasive - Advocating a specific point of view",
94
- "Narrative - Storytelling tone for layperson readers"
95
- ])
96
- source_type = st.selectbox("🌐 Sources to include", [
97
- "Web Only", "Academic Only", "Hybrid"
98
- ])
99
- custom_domains = st.text_input("πŸ” Query Domains (Optional)", placeholder="techcrunch.com, forbes.com")
100
-
101
- if st.button("Research"):
102
- try:
103
- with st.status("Starting agent tasks..."):
104
- st.info("🧠 Thinking through research questions...")
105
- time.sleep(1)
106
- st.info("🌐 Fetching data from selected sources...")
107
-
108
- all_data, citations = "", []
109
- if source_type in ["Web Only", "Hybrid"]:
110
- web = get_sources(topic, custom_domains)
111
- for w in web:
112
- all_data += f"- [{w['title']}]({w['url']})\n> {w['snippet']}\n\n"
113
- citations.append(generate_apa_citation(w["title"], w["url"], "web"))
114
- if source_type in ["Academic Only", "Hybrid"]:
115
- arxiv = get_arxiv_papers(topic)
116
- for p in arxiv:
117
- all_data += f"- [{p['title']}]({p['url']})\n> {p['summary'][:300]}...\n\n"
118
- citations.append(generate_apa_citation(p["title"], p["url"], "arxiv"))
119
- scholar = get_semantic_papers(topic)
120
- for s in scholar:
121
- all_data += f"- [{s['title']}]({s['url']})\n> {s['summary'][:300]}...\n\n"
122
- citations.append(generate_apa_citation(s["title"], s["url"], "semantic"))
123
-
124
- st.success("Data collection complete!")
125
-
126
- with st.spinner("πŸ“ Writing final research report..."):
127
- prompt = f"""
128
- # Research Task: {topic}
129
-
130
- Tone: {tone}
131
- Report Type: {report_type}
132
-
133
- Sources:
134
- {all_data}
135
-
136
- Now, synthesize:
137
- 1. Research questions and gap
138
- 2. A novel insight or direction
139
- 3. A real-world application scenario
140
- 4. A {report_type.lower()} in academic markdown (no headings)
141
- """
142
- output = call_llm([{"role": "user", "content": prompt}], max_tokens=3500)
143
-
144
- st.subheader("πŸ“„ Research Report")
145
- st.markdown(output, unsafe_allow_html=True)
146
-
147
- st.markdown("### πŸ“š APA Citations")
148
- for c in citations:
149
- st.markdown(f"- {c}")
150
-
151
- with st.spinner("πŸ§ͺ Checking for overlaps..."):
152
- overlaps = check_plagiarism(output, topic)
153
- if overlaps:
154
- st.warning("⚠️ Potential content overlap found.")
155
- for h in overlaps:
156
- st.markdown(f"**{h['title']}** - [{h['url']}]({h['url']})")
157
- else:
158
- st.success("βœ… No major overlaps detected.")
159
-
160
- except Exception as e:
161
- st.error(f"Error: {e}")
 
3
  import streamlit as st
4
  import requests
5
  import datetime
6
+ import openai
 
7
  import feedparser
8
+ from dotenv import load_dotenv
9
+ from tavily import TavilyClient
10
+ from PyPDF2 import PdfReader
11
+ import faiss
12
+ import numpy as np
13
 
14
+ # --- Load API Keys ---
15
  load_dotenv()
16
+ openai.api_key = os.getenv("OPENAI_API_KEY")
17
+ TAVILY_API_KEY = os.getenv("TAVILY_API_KEY", "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")
18
+ tavily = TavilyClient(api_key=TAVILY_API_KEY)
19
+
20
+ # --- Streamlit Config ---
21
+ st.set_page_config(page_title="GPT Researcher Agent", layout="wide")
22
+ st.title("πŸ“š GPT-Powered Research Assistant")
23
+
24
+ # --- Helper: APA Citation ---
25
+ def generate_apa_citation(title, url, source):
26
+ year = datetime.datetime.now().year
27
+ label = {
28
+ "arxiv": "*arXiv*", "semantic": "*Semantic Scholar*", "web": "*Web*"
29
+ }.get(source, "*Web*")
30
+ return f"{title}. ({year}). {label}. {url}"
31
+
32
+ # --- Search Tools ---
33
+ def tavily_search(query):
34
+ results = tavily.search(query, search_depth="advanced", max_results=5)
35
+ return results.get("results", [])
36
+
37
+ def arxiv_search(query):
 
 
 
 
 
 
 
 
 
38
  from urllib.parse import quote_plus
39
  url = f"http://export.arxiv.org/api/query?search_query=all:{quote_plus(query)}&start=0&max_results=3"
40
  feed = feedparser.parse(url)
 
44
  "url": next((l.href for l in e.links if l.type == "application/pdf"), "")
45
  } for e in feed.entries]
46
 
47
+ # --- Document Embedding ---
48
+ def embed_document(file):
49
+ doc_text = ""
50
+ if file.name.endswith(".pdf"):
51
+ reader = PdfReader(file)
52
+ for page in reader.pages:
53
+ text = page.extract_text()
54
+ if text:
55
+ doc_text += text
56
+ else:
57
+ doc_text = file.read().decode("utf-8")
58
+
59
+ chunks = [doc_text[i:i+1000] for i in range(0, len(doc_text), 1000)]
60
+ embeddings = openai.Embedding.create(input=chunks, model="text-embedding-ada-002")
61
+ vectors = [np.array(rec["embedding"], dtype=np.float32) for rec in embeddings["data"]]
62
+
63
+ dim = len(vectors[0])
64
+ index = faiss.IndexFlatL2(dim)
65
+ index.add(np.vstack(vectors))
66
+
67
+ return chunks, index
68
+
69
+ # --- Streaming GPT Call ---
70
+ def stream_response(messages):
71
+ response = openai.ChatCompletion.create(
72
+ model="gpt-4",
73
+ messages=messages,
74
+ max_tokens=3000,
75
+ stream=True
76
+ )
77
+ collected = ""
78
+ placeholder = st.empty()
79
+ for chunk in response:
80
+ delta = chunk["choices"][0].get("delta", {})
81
+ if "content" in delta:
82
+ token = delta["content"]
83
+ collected += token
84
+ placeholder.markdown(collected + "β–Œ")
85
+ placeholder.markdown(collected)
86
+ return collected
87
+
88
+ # --- Sidebar Input ---
89
+ with st.sidebar:
90
+ topic = st.text_input("πŸ” Research Topic", "AI in Sustainable Agriculture")
91
+ report_type = st.selectbox("πŸ“„ Report Type", ["Summary", "Detailed", "Academic Paper"])
92
+ tone = st.selectbox("🎯 Tone", ["Objective", "Scientific", "Persuasive"])
93
+ sources = st.selectbox("🌐 Sources", ["Web", "Documents", "Both"])
94
+ uploaded_file = st.file_uploader("πŸ“Ž Upload Document (PDF/TXT)", type=["pdf", "txt"])
95
+ start_button = st.button("πŸš€ Run Research")
96
+
97
+ # --- Main Agent Execution ---
98
+ if start_button and topic:
99
+ st.subheader("🧠 Agent Log")
100
+ with st.container():
101
+ st.markdown("<div style='max-height:300px; overflow-y:auto; background:#222; padding:10px; border-radius:10px;'>", unsafe_allow_html=True)
102
+ st.markdown("🧭 Starting research task...")
103
+ st.markdown(f"πŸ”Ž Topic: **{topic}** | Tone: _{tone}_ | Type: _{report_type}_")
104
+ st.markdown("</div>", unsafe_allow_html=True)
105
+
106
+ citations = []
107
+ context = ""
108
+
109
+ if sources in ["Web", "Both"]:
110
+ st.info("🌐 Searching web sources via Tavily...")
111
+ web_results = tavily_search(topic)
112
+ for r in web_results:
113
+ context += f"{r.get('content','')}
114
+ "
115
+ citations.append(generate_apa_citation(r.get("title", "Untitled"), r.get("url", "#"), "web"))
116
+
117
+ if sources in ["Documents", "Both"] and uploaded_file:
118
+ st.info("πŸ“„ Embedding and retrieving from uploaded document...")
119
+ chunks, index = embed_document(uploaded_file)
120
+ q_embed = openai.Embedding.create(input=[topic], model="text-embedding-ada-002")
121
+ q_vector = np.array(q_embed["data"][0]["embedding"], dtype=np.float32).reshape(1, -1)
122
+ D, I = index.search(q_vector, k=3)
123
+ for idx in I[0]:
124
+ context += chunks[idx] + "
125
+ "
126
+ citations.append(generate_apa_citation(uploaded_file.name, "Uploaded", "local"))
127
+
128
+ st.info("✍️ Generating final research report...")
129
+ messages = [
130
+ {"role": "system", "content": f"You are a research assistant. Write a {report_type.lower()} in a {tone.lower()} tone, citing sources."},
131
+ {"role": "user", "content": f"Topic: {topic}
132
+
133
+ Context:
134
+ {context}
135
+
136
+ Write a complete report in academic markdown format."}
137
+ ]
138
+
139
+ final_output = stream_response(messages)
140
+
141
+ # --- Show Output and Citations ---
142
+ st.subheader("πŸ“„ Final Report")
143
+ st.markdown(final_output, unsafe_allow_html=True)
144
+
145
+ st.subheader("πŸ“š References")
146
+ for cite in citations:
147
+ st.markdown(f"- {cite}")
148
+
149
+ st.download_button("πŸ’Ύ Download Markdown", final_output, file_name="report.md", mime="text/markdown")