import os
import requests
import feedparser
import networkx as nx
import gradio as gr
from transformers import pipeline
import openai
# --------------------------
# 1. arXiv API Integration
# --------------------------
def fetch_arxiv_papers(search_query="Artificial Intelligence", max_results=5):
    """
    Fetch paper metadata from the arXiv API using the legacy endpoint.
    By using the arXiv APIs, you are agreeing to arXiv's Terms of Use.
    Returns:
        List of dictionaries with keys: id, title, summary, published, authors.
    """
    # arXiv API endpoint
    base_url = "http://export.arxiv.org/api/query?"
    # Construct query parameters: see the arXiv API docs for details.
    query = f"search_query=all:{search_query}&start=0&max_results={max_results}"
    url = base_url + query
    response = requests.get(url)
    # Parse the Atom feed using feedparser
    feed = feedparser.parse(response.text)
    papers = []
    for entry in feed.entries:
        paper = {
            "id": entry.id,
            "title": entry.title.strip().replace("\n", " "),
            "summary": entry.summary.strip().replace("\n", " "),
            "published": entry.published,
            "authors": ", ".join(author.name for author in entry.authors),
        }
        papers.append(paper)
    return papers
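
# Illustrative example (not executed): shape of the data returned by
# fetch_arxiv_papers. The values shown are placeholders, not real arXiv records:
#
#   papers = fetch_arxiv_papers("graph neural networks", max_results=2)
#   papers[0]
#   # {"id": "http://arxiv.org/abs/<paper-id>",
#   #  "title": "<paper title>",
#   #  "summary": "<abstract text>",
#   #  "published": "<ISO 8601 timestamp>",
#   #  "authors": "<author 1>, <author 2>"}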
# --------------------------
# 2. Build a Simple Knowledge Graph
# --------------------------
def build_knowledge_graph(papers):
    """
    Create a directed knowledge graph from a list of papers.
    Here, a simple simulation links papers in publication order.
    In a real-world scenario, edges might be derived from citation relationships.
    Each node holds paper metadata; edges are added sequentially for demonstration.
    """
    G = nx.DiGraph()
    for i, paper in enumerate(papers):
        # Use a short identifier like 'P1', 'P2', etc.
        node_id = f"P{i+1}"
        G.add_node(
            node_id,
            title=paper["title"],
            summary=paper["summary"],
            published=paper["published"],
            authors=paper["authors"],
        )
    # Simulate citation relationships: for demo purposes, link each paper to the next one.
    # The context is a simple statement; in practice, this could be extracted citation context.
    for i in range(len(papers) - 1):
        source = f"P{i+1}"
        target = f"P{i+2}"
        context = f"Paper '{papers[i]['title']}' builds on the ideas in '{papers[i+1]['title']}'."
        G.add_edge(source, target, context=context)
    return G
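
# Illustrative example (not executed): for three papers, build_knowledge_graph
# yields nodes P1, P2, P3 and the simulated chain of edges P1 -> P2 -> P3.
# The titles referenced below are hypothetical placeholders:
#
#   G = build_knowledge_graph(papers)
#   list(G.nodes)                     # ['P1', 'P2', 'P3']
#   list(G.edges)                     # [('P1', 'P2'), ('P2', 'P3')]
#   G.edges['P1', 'P2']['context']    # "Paper '<title 1>' builds on the ideas in '<title 2>'."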
# --------------------------
# 3. Semantic Summarization on Citation Contexts
# --------------------------
# Initialize the Hugging Face summarizer (using an open-source model)
summarizer = pipeline("summarization", model="facebook/bart-large-cnn")

def summarize_context(text):
    """
    Given a text (e.g. simulated citation context), return a semantic summary.
    """
    if not text.strip():
        return "No context available."
    summary = summarizer(text, max_length=50, min_length=25, do_sample=False)
    return summary[0]["summary_text"]

def enrich_graph_with_summaries(G):
    """
    For each edge in the graph, compute a semantic summary of the citation context.
    Store the result as an edge attribute.
    """
    for u, v, data in G.edges(data=True):
        context_text = data.get("context", "")
        data["semantic_summary"] = summarize_context(context_text)
    return G
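
# Illustrative example (not executed): after enrichment, each edge carries a
# 'semantic_summary' attribute alongside the original 'context'. Note that
# facebook/bart-large-cnn may warn when the input is shorter than max_length;
# the call still returns a summary. The value shown is a placeholder:
#
#   G = enrich_graph_with_summaries(G)
#   G.edges['P1', 'P2']['semantic_summary']   # "<short model-generated paraphrase of the context>"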
# --------------------------
# 4. Generate Graph Summary Text
# --------------------------
def generate_graph_summary(G):
    """
    Generate a text summary of the knowledge graph. For each edge, the summary will include:
    "Paper 'source_title' cites 'target_title': <semantic summary>"
    """
    summary_lines = []
    for u, v, data in G.edges(data=True):
        source_title = G.nodes[u]["title"]
        target_title = G.nodes[v]["title"]
        sem_summary = data.get("semantic_summary", "No summary available.")
        line = f"Paper '{source_title}' cites '{target_title}': {sem_summary}"
        summary_lines.append(line)
    return "\n".join(summary_lines)
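
# Illustrative example (not executed): one line of the resulting summary text,
# with hypothetical titles and a placeholder semantic summary:
#
#   print(generate_graph_summary(G))
#   # Paper '<title 1>' cites '<title 2>': <semantic summary of the simulated context>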
# --------------------------
# 5. Research Idea Generation using OpenAI
# --------------------------
# Set your OpenAI API key from the environment (ensure OPENAI_API_KEY is set).
openai.api_key = os.getenv("OPENAI_API_KEY")

def generate_research_ideas(graph_summary_text):
    """
    Generate innovative research ideas using OpenAI's GPT model.
    The prompt includes the semantic graph summary.
    Note: openai.ChatCompletion.create is the legacy chat interface and
    requires the pre-1.0 openai package (openai<1.0).
    """
    prompt = f"""
Based on the following summary of research literature and their semantic relationships, propose innovative research ideas in the field of Artificial Intelligence:
{graph_summary_text}
Research Ideas:
"""
    response = openai.ChatCompletion.create(
        model="gpt-3.5-turbo",
        messages=[
            {"role": "system", "content": "You are an expert AI researcher."},
            {"role": "user", "content": prompt},
        ],
        max_tokens=200,
        temperature=0.7,
        n=1,
    )
    ideas = response.choices[0].message.content.strip()
    return ideas
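
# Illustrative example (not executed): the function takes the plain-text graph
# summary and returns free-form idea text from the model. Output is
# nondeterministic at temperature 0.7, so the lines below are only placeholders:
#
#   ideas = generate_research_ideas("Paper 'A' cites 'B': <semantic summary>")
#   print(ideas)
#   # 1. <idea suggested by the model>
#   # 2. <another idea suggested by the model>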
# --------------------------
# 6. Main Pipeline (Tie Everything Together)
# --------------------------
def process_arxiv_and_generate(search_query):
    """
    Main function called via the Gradio interface.
    1. Fetches papers from arXiv (ensuring compliance with the arXiv API Terms of Use).
    2. Builds and enriches a simulated knowledge graph.
    3. Generates a graph summary.
    4. Produces innovative research ideas using OpenAI's API.
    """
    # Step 1: Fetch papers from arXiv (using their API and respecting their terms)
    papers = fetch_arxiv_papers(search_query=search_query, max_results=5)
    if not papers:
        return "No papers were retrieved from arXiv. Please try a different query.", ""
    # Step 2: Build the knowledge graph from the retrieved papers
    G = build_knowledge_graph(papers)
    # Step 3: Enrich the graph by summarizing the (simulated) citation contexts
    G = enrich_graph_with_summaries(G)
    # Step 4: Generate a text summary of the graph
    graph_summary = generate_graph_summary(G)
    # Step 5: Generate research ideas using OpenAI's API
    research_ideas = generate_research_ideas(graph_summary)
    # Return the graph summary and the generated ideas for display in the UI.
    return graph_summary, research_ideas
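
# Illustrative example (not executed): running the full pipeline from a Python
# shell instead of the UI. This performs real arXiv, Hugging Face, and OpenAI
# calls, so it needs network access and OPENAI_API_KEY to be set:
#
#   graph_summary, research_ideas = process_arxiv_and_generate("reinforcement learning")
#   print(graph_summary)
#   print(research_ideas)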
# --------------------------
# 7. Gradio Interface for Hugging Face Space
# --------------------------
demo = gr.Interface(
    fn=process_arxiv_and_generate,
    inputs=gr.Textbox(
        lines=1,
        label="Search Query for arXiv (e.g., 'Artificial Intelligence')",
        value="Artificial Intelligence",
    ),
    outputs=[
        gr.Textbox(label="Knowledge Graph Summary"),
        gr.Textbox(label="Generated Research Ideas"),
    ],
    title="Graph of AI Ideas: Leveraging Knowledge Graphs, arXiv Metadata & LLMs",
    description=(
        "This Hugging Face Space application retrieves recent arXiv e-prints based on your search query "
        "and builds a simple knowledge graph (using simulated citation relationships) from the paper metadata. "
        "A Hugging Face summarization model enriches these simulated citation contexts, and the graph summary "
        "is then fed to OpenAI's GPT model to generate innovative AI research ideas.\n\n"
        "By using this application, you agree to the arXiv API Terms of Use. Please review the arXiv API documentation "
        "for guidelines on rate limits, attribution, and usage."
    ),
    allow_flagging="never",
)

# Launch the Gradio interface (Hugging Face Spaces automatically runs this file)
demo.launch()
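
# Illustrative local run (assumes this file is saved as app.py, the default
# entry point on Hugging Face Spaces; the key below is a hypothetical placeholder):
#
#   export OPENAI_API_KEY="sk-..."
#   python app.py
#   # Gradio then serves the app locally, by default at http://127.0.0.1:7860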