import os
import requests
import feedparser
import networkx as nx
import gradio as gr
from transformers import pipeline
import openai

# --------------------------
# 1. arXiv API Integration
# --------------------------
def fetch_arxiv_papers(search_query="Artificial Intelligence", max_results=5):
    """
    Fetch paper metadata from the arXiv API using the legacy endpoint.
    By using the arXiv APIs, you are agreeing to arXiv's Terms of Use.
    
    Returns:
        List of dictionaries with keys: id, title, summary, published, authors.
    """
    # arXiv API query endpoint; requests builds the query string from `params`,
    # which also URL-encodes multi-word search queries.
    base_url = "http://export.arxiv.org/api/query"
    # Construct query parameters: see the arXiv API docs for details.
    params = {
        "search_query": f"all:{search_query}",
        "start": 0,
        "max_results": max_results,
    }
    response = requests.get(base_url, params=params, timeout=30)
    response.raise_for_status()
    # Parse the Atom feed using feedparser
    feed = feedparser.parse(response.text)
    papers = []
    for entry in feed.entries:
        paper = {
            "id": entry.id,
            "title": entry.title.strip().replace("\n", " "),
            "summary": entry.summary.strip().replace("\n", " "),
            "published": entry.published,
            "authors": ", ".join(author.name for author in entry.authors)
        }
        papers.append(paper)
    return papers
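
# Example (sketch): fetch_arxiv_papers("graph neural networks", max_results=3) issues a
# single GET to the query endpoint and returns up to three metadata dictionaries.
# The arXiv API guidelines ask clients to keep request rates modest (on the order of
# one request every few seconds), so avoid calling this function in a tight loop.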

# --------------------------
# 2. Build a Simple Knowledge Graph
# --------------------------
def build_knowledge_graph(papers):
    """
    Create a directed knowledge graph from a list of papers.
    Here, a simple simulation links papers in publication order.
    In a real-world scenario, edges might be derived from citation relationships.
    
    Each node holds paper metadata; edges are added sequentially for demonstration.
    """
    G = nx.DiGraph()
    for i, paper in enumerate(papers):
        # Use a short identifier like 'P1', 'P2', etc.
        node_id = f"P{i+1}"
        G.add_node(
            node_id,
            title=paper["title"],
            summary=paper["summary"],
            published=paper["published"],
            authors=paper["authors"],
        )
    
    # Simulate citation relationships: for demo purposes, link each paper to the next one.
    # The context is a simple statement; in practice, this could be extracted citation context.
    for i in range(len(papers) - 1):
        source = f"P{i+1}"
        target = f"P{i+2}"
        context = f"Paper '{papers[i]['title']}' builds on the ideas in '{papers[i+1]['title']}'."
        G.add_edge(source, target, context=context)
    return G
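
# For example, with three fetched papers the simulated graph is simply P1 -> P2 -> P3,
# and the citation-context sentence is stored on each edge:
#   G.edges["P1", "P2"]["context"]
#   -> "Paper '<title of P1>' builds on the ideas in '<title of P2>'."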

# --------------------------
# 3. Semantic Summarization on Citation Contexts
# --------------------------
# Initialize the Hugging Face summarizer (using an open-source model)
summarizer = pipeline("summarization", model="facebook/bart-large-cnn")

def summarize_context(text):
    """
    Given a text (e.g. simulated citation context), return a semantic summary.
    """
    if not text.strip():
        return "No context available."
    summary = summarizer(text, max_length=50, min_length=25, do_sample=False)
    return summary[0]["summary_text"]
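
# Note: for very short contexts the summarizer may warn that max_length exceeds the
# input length. The output is still usable; lowering max_length for short inputs
# would silence the warning.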

def enrich_graph_with_summaries(G):
    """
    For each edge in the graph, compute a semantic summary of the citation context.
    Store the result as an edge attribute.
    """
    for u, v, data in G.edges(data=True):
        context_text = data.get("context", "")
        data["semantic_summary"] = summarize_context(context_text)
    return G

# --------------------------
# 4. Generate Graph Summary Text
# --------------------------
def generate_graph_summary(G):
    """
    Generate a text summary of the knowledge graph. For each edge, the summary will include:
    "Paper 'source_title' cites 'target_title': <semantic summary>"
    """
    summary_lines = []
    for u, v, data in G.edges(data=True):
        source_title = G.nodes[u]["title"]
        target_title = G.nodes[v]["title"]
        sem_summary = data.get("semantic_summary", "No summary available.")
        line = f"Paper '{source_title}' cites '{target_title}': {sem_summary}"
        summary_lines.append(line)
    return "\n".join(summary_lines)

# --------------------------
# 5. Research Idea Generation using OpenAI
# --------------------------
# Set your OpenAI API key from the environment (ensure OPENAI_API_KEY is set).
# Note: this file uses the legacy openai<1.0 SDK interface (openai.ChatCompletion);
# pin `openai<1.0` in requirements.txt or migrate to the 1.x client API if upgrading.
openai.api_key = os.getenv("OPENAI_API_KEY")

def generate_research_ideas(graph_summary_text):
    """
    Generate innovative research ideas using OpenAI's GPT model.
    The prompt includes the semantic graph summary.
    """
    prompt = f"""
Based on the following summary of research literature and their semantic relationships, propose innovative research ideas in the field of Artificial Intelligence:

{graph_summary_text}

Research Ideas:
"""
    response = openai.ChatCompletion.create(
        model="gpt-3.5-turbo",
        messages=[
            {"role": "system", "content": "You are an expert AI researcher."},
            {"role": "user", "content": prompt}
        ],
        max_tokens=200,
        temperature=0.7,
        n=1,
    )
    ideas = response.choices[0].message.content.strip()
    return ideas
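
# Sketch of the equivalent call with the openai>=1.0 client API, in case the installed
# SDK is newer than this file assumes (kept as a comment so the module still runs with
# the legacy SDK):
#
#   from openai import OpenAI
#   client = OpenAI()  # reads OPENAI_API_KEY from the environment
#   response = client.chat.completions.create(
#       model="gpt-3.5-turbo",
#       messages=[{"role": "system", "content": "You are an expert AI researcher."},
#                 {"role": "user", "content": prompt}],
#       max_tokens=200,
#       temperature=0.7,
#   )
#   ideas = response.choices[0].message.content.strip()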

# --------------------------
# 6. Main Pipeline (Tie Everything Together)
# --------------------------
def process_arxiv_and_generate(search_query):
    """
    Main function called via the Gradio interface.
    1. Fetches papers from arXiv (ensuring compliance with arXiv API Terms of Use).
    2. Builds and enriches a simulated knowledge graph.
    3. Generates a graph summary.
    4. Produces innovative research ideas using OpenAI's API.
    """
    # Step 1: Fetch papers from arXiv (by using their API and respecting their terms)
    papers = fetch_arxiv_papers(search_query=search_query, max_results=5)
    if not papers:
        return "No papers were retrieved from arXiv. Please try a different query.", ""
    
    # Step 2: Build the knowledge graph from the retrieved papers
    G = build_knowledge_graph(papers)
    # Step 3: Enrich the graph by summarizing the (simulated) citation contexts
    G = enrich_graph_with_summaries(G)
    # Step 4: Generate a text summary of the graph
    graph_summary = generate_graph_summary(G)
    # Step 5: Generate research ideas using OpenAI's API
    research_ideas = generate_research_ideas(graph_summary)
    
    # Return the graph summary along with the generated ideas as separate outputs.
    return graph_summary, research_ideas
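
# Example (sketch): process_arxiv_and_generate("graph neural networks") returns a
# (graph_summary, research_ideas) pair of strings, which Gradio maps onto the two
# output textboxes defined below.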

# --------------------------
# 7. Gradio Interface for Hugging Face Space
# --------------------------
demo = gr.Interface(
    fn=process_arxiv_and_generate,
    inputs=gr.Textbox(lines=1, label="Search Query for arXiv (e.g., 'Artificial Intelligence')", value="Artificial Intelligence"),
    outputs=[
        gr.Textbox(label="Knowledge Graph Summary"),
        gr.Textbox(label="Generated Research Ideas"),
    ],
    title="Graph of AI Ideas: Leveraging Knowledge Graphs, arXiv Metadata & LLMs",
    description=(
        "This Hugging Face Space application retrieves recent arXiv e-prints based on your search query "
        "and builds a simple knowledge graph (using simulated citation relationships) from the paper metadata. "
        "A Hugging Face summarization model enriches these simulated citation contexts, and the graph summary "
        "is then fed to OpenAI's GPT model to generate innovative AI research ideas.\n\n"
        "By using this application, you agree to the arXiv API Terms of Use. Please review the arXiv API documentation "
        "for guidelines on rate limits, attribution, and usage."
    ),
    allow_flagging="never",
)

# Launch the Gradio interface (Hugging Face Spaces automatically runs this file)
demo.launch()