import os
import requests
import feedparser
import networkx as nx
import gradio as gr
from transformers import pipeline
import openai
# --------------------------
# 1. arXiv API Integration
# --------------------------
def fetch_arxiv_papers(search_query="Artificial Intelligence", max_results=5):
    """
    Fetch paper metadata from the arXiv API using the legacy endpoint.
    By using the arXiv APIs, you are agreeing to arXiv's Terms of Use.
    Returns:
        List of dictionaries with keys: id, title, summary, published, authors.
    """
    # arXiv API endpoint
    base_url = "http://export.arxiv.org/api/query?"
    # Construct query parameters: see the arXiv API docs for details.
    query = f"search_query=all:{search_query}&start=0&max_results={max_results}"
    url = base_url + query
    response = requests.get(url)
    # Parse the Atom feed using feedparser
    feed = feedparser.parse(response.text)
    papers = []
    for entry in feed.entries:
        paper = {
            "id": entry.id,
            "title": entry.title.strip().replace("\n", " "),
            "summary": entry.summary.strip().replace("\n", " "),
            "published": entry.published,
            "authors": ", ".join(author.name for author in entry.authors),
        }
        papers.append(paper)
    return papers
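
# Illustrative example (not executed): shape of the data returned by
# fetch_arxiv_papers. The values shown are placeholders, not real arXiv records:
#
#   papers = fetch_arxiv_papers("graph neural networks", max_results=2)
#   papers[0]
#   # {"id": "http://arxiv.org/abs/<paper-id>",
#   #  "title": "<paper title>",
#   #  "summary": "<abstract text>",
#   #  "published": "<ISO 8601 timestamp>",
#   #  "authors": "<author 1>, <author 2>"}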
# --------------------------
# 2. Build a Simple Knowledge Graph
# --------------------------
def build_knowledge_graph(papers):
    """
    Create a directed knowledge graph from a list of papers.
    Here, a simple simulation links papers in publication order.
    In a real-world scenario, edges might be derived from citation relationships.
    Each node holds paper metadata; edges are added sequentially for demonstration.
    """
    G = nx.DiGraph()
    for i, paper in enumerate(papers):
        # Use a short identifier like 'P1', 'P2', etc.
        node_id = f"P{i+1}"
        G.add_node(
            node_id,
            title=paper["title"],
            summary=paper["summary"],
            published=paper["published"],
            authors=paper["authors"],
        )
    # Simulate citation relationships: for demo purposes, link each paper to the next one.
    # The context is a simple statement; in practice, this could be extracted citation context.
    for i in range(len(papers) - 1):
        source = f"P{i+1}"
        target = f"P{i+2}"
        context = f"Paper '{papers[i]['title']}' builds on the ideas in '{papers[i+1]['title']}'."
        G.add_edge(source, target, context=context)
    return G
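
# Illustrative example (not executed): for three papers, build_knowledge_graph
# yields nodes P1, P2, P3 and the simulated chain of edges P1 -> P2 -> P3.
# The titles referenced below are hypothetical placeholders:
#
#   G = build_knowledge_graph(papers)
#   list(G.nodes)                     # ['P1', 'P2', 'P3']
#   list(G.edges)                     # [('P1', 'P2'), ('P2', 'P3')]
#   G.edges['P1', 'P2']['context']    # "Paper '<title 1>' builds on the ideas in '<title 2>'."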
# --------------------------
# 3. Semantic Summarization on Citation Contexts
# --------------------------
# Initialize the Hugging Face summarizer (using an open-source model)
summarizer = pipeline("summarization", model="facebook/bart-large-cnn")

def summarize_context(text):
    """
    Given a text (e.g. simulated citation context), return a semantic summary.
    """
    if not text.strip():
        return "No context available."
    summary = summarizer(text, max_length=50, min_length=25, do_sample=False)
    return summary[0]["summary_text"]

def enrich_graph_with_summaries(G):
    """
    For each edge in the graph, compute a semantic summary of the citation context.
    Store the result as an edge attribute.
    """
    for u, v, data in G.edges(data=True):
        context_text = data.get("context", "")
        data["semantic_summary"] = summarize_context(context_text)
    return G
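
# Illustrative example (not executed): after enrichment, each edge carries a
# 'semantic_summary' attribute alongside the original 'context'. Note that
# facebook/bart-large-cnn may warn when the input is shorter than max_length;
# the call still returns a summary. The value shown is a placeholder:
#
#   G = enrich_graph_with_summaries(G)
#   G.edges['P1', 'P2']['semantic_summary']   # "<short model-generated paraphrase of the context>"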
# --------------------------
# 4. Generate Graph Summary Text
# --------------------------
def generate_graph_summary(G):
    """
    Generate a text summary of the knowledge graph. For each edge, the summary will include:
    "Paper 'source_title' cites 'target_title': <semantic summary>"
    """
    summary_lines = []
    for u, v, data in G.edges(data=True):
        source_title = G.nodes[u]["title"]
        target_title = G.nodes[v]["title"]
        sem_summary = data.get("semantic_summary", "No summary available.")
        line = f"Paper '{source_title}' cites '{target_title}': {sem_summary}"
        summary_lines.append(line)
    return "\n".join(summary_lines)
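
# Illustrative example (not executed): one line of the resulting summary text,
# with hypothetical titles and a placeholder semantic summary:
#
#   print(generate_graph_summary(G))
#   # Paper '<title 1>' cites '<title 2>': <semantic summary of the simulated context>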
# --------------------------
# 5. Research Idea Generation using OpenAI
# --------------------------
# Set your OpenAI API key from the environment (ensure OPENAI_API_KEY is set).
openai.api_key = os.getenv("OPENAI_API_KEY")

def generate_research_ideas(graph_summary_text):
    """
    Generate innovative research ideas using OpenAI's GPT model.
    The prompt includes the semantic graph summary.
    Note: openai.ChatCompletion.create is the legacy chat interface and
    requires the pre-1.0 openai package (openai<1.0).
    """
    prompt = f"""
Based on the following summary of research literature and their semantic relationships, propose innovative research ideas in the field of Artificial Intelligence:
{graph_summary_text}
Research Ideas:
"""
    response = openai.ChatCompletion.create(
        model="gpt-3.5-turbo",
        messages=[
            {"role": "system", "content": "You are an expert AI researcher."},
            {"role": "user", "content": prompt},
        ],
        max_tokens=200,
        temperature=0.7,
        n=1,
    )
    ideas = response.choices[0].message.content.strip()
    return ideas
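
# Illustrative example (not executed): the function takes the plain-text graph
# summary and returns free-form idea text from the model. Output is
# nondeterministic at temperature 0.7, so the lines below are only placeholders:
#
#   ideas = generate_research_ideas("Paper 'A' cites 'B': <semantic summary>")
#   print(ideas)
#   # 1. <idea suggested by the model>
#   # 2. <another idea suggested by the model>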
# --------------------------
# 6. Main Pipeline (Tie Everything Together)
# --------------------------
def process_arxiv_and_generate(search_query):
    """
    Main function called via the Gradio interface.
    1. Fetches papers from arXiv (ensuring compliance with the arXiv API Terms of Use).
    2. Builds and enriches a simulated knowledge graph.
    3. Generates a graph summary.
    4. Produces innovative research ideas using OpenAI's API.
    """
    # Step 1: Fetch papers from arXiv (using their API and respecting their terms)
    papers = fetch_arxiv_papers(search_query=search_query, max_results=5)
    if not papers:
        return "No papers were retrieved from arXiv. Please try a different query.", ""
    # Step 2: Build the knowledge graph from the retrieved papers
    G = build_knowledge_graph(papers)
    # Step 3: Enrich the graph by summarizing the (simulated) citation contexts
    G = enrich_graph_with_summaries(G)
    # Step 4: Generate a text summary of the graph
    graph_summary = generate_graph_summary(G)
    # Step 5: Generate research ideas using OpenAI's API
    research_ideas = generate_research_ideas(graph_summary)
    # Return the graph summary and the generated ideas for display in the UI.
    return graph_summary, research_ideas
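
# Illustrative example (not executed): running the full pipeline from a Python
# shell instead of the UI. This performs real arXiv, Hugging Face, and OpenAI
# calls, so it needs network access and OPENAI_API_KEY to be set:
#
#   graph_summary, research_ideas = process_arxiv_and_generate("reinforcement learning")
#   print(graph_summary)
#   print(research_ideas)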
# --------------------------
# 7. Gradio Interface for Hugging Face Space
# --------------------------
demo = gr.Interface(
    fn=process_arxiv_and_generate,
    inputs=gr.Textbox(
        lines=1,
        label="Search Query for arXiv (e.g., 'Artificial Intelligence')",
        value="Artificial Intelligence",
    ),
    outputs=[
        gr.Textbox(label="Knowledge Graph Summary"),
        gr.Textbox(label="Generated Research Ideas"),
    ],
    title="Graph of AI Ideas: Leveraging Knowledge Graphs, arXiv Metadata & LLMs",
    description=(
        "This Hugging Face Space application retrieves recent arXiv e-prints based on your search query "
        "and builds a simple knowledge graph (using simulated citation relationships) from the paper metadata. "
        "A Hugging Face summarization model enriches these simulated citation contexts, and the graph summary "
        "is then fed to OpenAI's GPT model to generate innovative AI research ideas.\n\n"
        "By using this application, you agree to the arXiv API Terms of Use. Please review the arXiv API documentation "
        "for guidelines on rate limits, attribution, and usage."
    ),
    allow_flagging="never",
)

# Launch the Gradio interface (Hugging Face Spaces automatically runs this file)
demo.launch()
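
# Illustrative local run (assumes this file is saved as app.py, the default
# entry point on Hugging Face Spaces; the key below is a hypothetical placeholder):
#
#   export OPENAI_API_KEY="sk-..."
#   python app.py
#   # Gradio then serves the app locally, by default at http://127.0.0.1:7860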