import os
import requests
import feedparser
import networkx as nx
import gradio as gr
from transformers import pipeline
import openai
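
# Note: on a Hugging Face Space this script expects a requirements.txt alongside it.
# Based on the imports above, that file would need roughly: requests, feedparser,
# networkx, gradio, transformers, torch (for the summarization pipeline), and openai.
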
# --------------------------
# 1. arXiv API Integration
# --------------------------
def fetch_arxiv_papers(search_query="Artificial Intelligence", max_results=5):
"""
Fetch paper metadata from the arXiv API using the legacy endpoint.
By using the arXiv APIs, you are agreeing to arXiv's Terms of Use.
Returns:
List of dictionaries with keys: id, title, summary, published, authors.
"""
# arXiv API endpoint
base_url = "http://export.arxiv.org/api/query?"
# Construct query parameters: see arXiv API docs for details.
query = f"search_query=all:{search_query}&start=0&max_results={max_results}"
url = base_url + query
response = requests.get(url)
# Parse the Atom feed using feedparser
feed = feedparser.parse(response.text)
papers = []
for entry in feed.entries:
paper = {
"id": entry.id,
"title": entry.title.strip().replace("\n", " "),
"summary": entry.summary.strip().replace("\n", " "),
"published": entry.published,
"authors": ", ".join(author.name for author in entry.authors)
}
papers.append(paper)
return papers
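# For reference, each returned dict looks roughly like this (values are illustrative,
# not real arXiv output):
# {
#     "id": "http://arxiv.org/abs/2401.00001v1",
#     "title": "Some Paper Title",
#     "summary": "Abstract text ...",
#     "published": "2024-01-01T00:00:00Z",
#     "authors": "Alice Example, Bob Example"
# }
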
# --------------------------
# 2. Build a Simple Knowledge Graph
# --------------------------
def build_knowledge_graph(papers):
"""
Create a directed knowledge graph from a list of papers.
Here, a simple simulation links papers in publication order.
In a real-world scenario, edges might be derived from citation relationships.
Each node holds paper metadata; edges are added sequentially for demonstration.
"""
G = nx.DiGraph()
for i, paper in enumerate(papers):
# Use a short identifier like 'P1', 'P2', etc.
node_id = f"P{i+1}"
G.add_node(node_id, title=paper["title"], summary=paper["summary"], published=paper["published"], authors=paper["authors"])
# Simulate citation relationships: for demo purposes, link each paper to the next one.
# The context is a simple statement; in practice, this could be extracted citation context.
for i in range(len(papers) - 1):
source = f"P{i+1}"
target = f"P{i+2}"
context = f"Paper '{papers[i]['title']}' builds on the ideas in '{papers[i+1]['title']}'."
G.add_edge(source, target, context=context)
return G
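# A more realistic graph could replace the sequential edges above with actual citation
# links, e.g. by querying the Semantic Scholar Graph API for each arXiv ID. A rough,
# untested sketch of that idea (not something this app currently does):
#
#     arxiv_id = paper["id"].rsplit("/", 1)[-1]   # e.g. "2401.00001v1"
#     refs = requests.get(
#         f"https://api.semanticscholar.org/graph/v1/paper/arXiv:{arxiv_id}/references",
#         params={"fields": "title"}, timeout=30,
#     ).json()
#     # ...then add an edge whenever a referenced title matches another fetched paper.
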
# --------------------------
# 3. Semantic Summarization on Citation Contexts
# --------------------------
# Initialize the Hugging Face summarizer (using an open-source model)
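# Note: the facebook/bart-large-cnn weights are on the order of 1.6 GB, so the first
# startup of the Space will spend time downloading the model; inference runs on CPU
# unless the Space is assigned GPU hardware.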
summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
def summarize_context(text):
"""
Given a text (e.g. simulated citation context), return a semantic summary.
"""
if not text.strip():
return "No context available."
summary = summarizer(text, max_length=50, min_length=25, do_sample=False)
return summary[0]["summary_text"]
def enrich_graph_with_summaries(G):
"""
For each edge in the graph, compute a semantic summary of the citation context.
Store the result as an edge attribute.
"""
for u, v, data in G.edges(data=True):
context_text = data.get("context", "")
data["semantic_summary"] = summarize_context(context_text)
return G
# --------------------------
# 4. Generate Graph Summary Text
# --------------------------
def generate_graph_summary(G):
"""
Generate a text summary of the knowledge graph. For each edge, the summary will include:
"Paper 'source_title' cites 'target_title': <semantic summary>"
"""
summary_lines = []
for u, v, data in G.edges(data=True):
source_title = G.nodes[u]["title"]
target_title = G.nodes[v]["title"]
sem_summary = data.get("semantic_summary", "No summary available.")
line = f"Paper '{source_title}' cites '{target_title}': {sem_summary}"
summary_lines.append(line)
return "\n".join(summary_lines)
# --------------------------
# 5. Research Idea Generation using OpenAI
# --------------------------
# Set your OpenAI API key from the environment (ensure OPENAI_API_KEY is set)
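# Note: openai.api_key and openai.ChatCompletion below follow the legacy pre-1.0 openai SDK
# (pin openai<1.0 in requirements.txt). With openai>=1.0 the equivalent call would go through
# a client object, roughly: client = openai.OpenAI(); client.chat.completions.create(...).
# On a Hugging Face Space, OPENAI_API_KEY can be provided as a repository secret.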
openai.api_key = os.getenv("OPENAI_API_KEY")
def generate_research_ideas(graph_summary_text):
"""
Generate innovative research ideas using OpenAI's GPT model.
The prompt includes the semantic graph summary.
"""
prompt = f"""
Based on the following summary of research literature and their semantic relationships, propose innovative research ideas in the field of Artificial Intelligence:
{graph_summary_text}
Research Ideas:
"""
response = openai.ChatCompletion.create(
model="gpt-3.5-turbo",
messages=[
{"role": "system", "content": "You are an expert AI researcher."},
{"role": "user", "content": prompt}
],
max_tokens=200,
temperature=0.7,
n=1,
)
ideas = response.choices[0].message.content.strip()
return ideas
# --------------------------
# 6. Main Pipeline (Tie Everything Together)
# --------------------------
def process_arxiv_and_generate(search_query):
"""
Main function called via the Gradio interface.
1. Fetches papers from arXiv (ensuring compliance with arXiv API Terms of Use).
2. Builds and enriches a simulated knowledge graph.
3. Generates a graph summary.
4. Produces innovative research ideas using OpenAI's API.
"""
# Step 1: Fetch papers from arXiv (by using their API and respecting their terms)
papers = fetch_arxiv_papers(search_query=search_query, max_results=5)
if not papers:
return "No papers were retrieved from arXiv. Please try a different query.", ""
# Step 2: Build the knowledge graph from the retrieved papers
G = build_knowledge_graph(papers)
# Step 3: Enrich the graph by summarizing the (simulated) citation contexts
G = enrich_graph_with_summaries(G)
# Step 4: Generate a text summary of the graph
graph_summary = generate_graph_summary(G)
# Step 5: Generate research ideas using OpenAI's API
research_ideas = generate_research_ideas(graph_summary)
# Build a result text that shows the graph summary along with the generated ideas.
return graph_summary, research_ideas
# --------------------------
# 7. Gradio Interface for Hugging Face Space
# --------------------------
demo = gr.Interface(
    fn=process_arxiv_and_generate,
    inputs=gr.Textbox(lines=1, label="Search Query for arXiv (e.g., 'Artificial Intelligence')", value="Artificial Intelligence"),
    outputs=[
        gr.Textbox(label="Knowledge Graph Summary"),
        gr.Textbox(label="Generated Research Ideas")
    ],
    title="Graph of AI Ideas: Leveraging Knowledge Graphs, arXiv Metadata & LLMs",
    description=(
        "This Hugging Face Space application retrieves recent arXiv e-prints based on your search query "
        "and builds a simple knowledge graph (using simulated citation relationships) from the paper metadata. "
        "A Hugging Face summarization model enriches these simulated citation contexts, and the graph summary "
        "is then fed to OpenAI's GPT model to generate innovative AI research ideas.\n\n"
        "By using this application, you agree to the arXiv API Terms of Use. Please review the arXiv API documentation "
        "for guidelines on rate limits, attribution, and usage."
    ),
    allow_flagging="never",
)
# Launch the Gradio interface (Hugging Face Spaces automatically runs this file)
demo.launch()