|
import logging
import os

import google.generativeai as genai
import requests
from bs4 import BeautifulSoup
from fastapi import FastAPI, HTTPException
from fastapi.responses import JSONResponse
from neo4j import GraphDatabase, basic_auth
|
|
|
|
|
|
|
|
|
# Root logger configuration: INFO level, timestamped records to stderr.
_LOG_FORMAT = '%(asctime)s - %(name)s - %(levelname)s - %(message)s'
logging.basicConfig(level=logging.INFO, format=_LOG_FORMAT, handlers=[logging.StreamHandler()])

# Module-level logger used throughout this file.
logger = logging.getLogger(__name__)
|
|
|
|
|
# All connection details and API keys come exclusively from the environment;
# any of these may be None when the corresponding variable is unset.
NEO4J_URI = os.environ.get("NEO4J_URI")
NEO4J_USER = os.environ.get("NEO4J_USER")
NEO4J_PASSWORD = os.environ.get("NEO4J_PASSWORD")
GEMINI_API_KEY = os.environ.get("GEMINI_API_KEY")
|
|
|
|
|
# Log loudly at import time when Neo4j credentials are missing; the request
# handler performs its own check and returns HTTP 500, so we do not exit here.
if not (NEO4J_URI and NEO4J_USER and NEO4J_PASSWORD):
    logger.critical("CRITICAL ERROR: NEO4J_URI, NEO4J_USER, and NEO4J_PASSWORD environment variables must be set.")
|
|
|
|
|
|
|
|
|
# Public FastAPI application object; route decorators below attach to it.
_APP_METADATA = {
    "title": "Arxiv to Neo4j Importer",
    "description": "API to fetch research paper data from Arxiv, summarize it with Gemini, and add it to Neo4j.",
    "version": "1.0.0",
}
app = FastAPI(**_APP_METADATA)
|
|
|
|
|
# Gemini client is optional: when it cannot be initialized the service still
# runs, but downstream code substitutes placeholder text for summaries.
gemini_model = None
if not GEMINI_API_KEY:
    logger.warning("WARNING: GEMINI_API_KEY environment variable not set. Summary generation will be disabled.")
else:
    try:
        genai.configure(api_key=GEMINI_API_KEY)
        gemini_model = genai.GenerativeModel(model_name="gemini-2.5-flash-preview-05-20")
        logger.info("Gemini API client initialized successfully.")
    except Exception as e:
        logger.warning(f"WARNING: Failed to initialize Gemini API client: {e}. Summary generation will be affected.")
|
|
|
|
|
|
|
def get_content(number: str, node_type: str) -> str:
    """Fetch the raw HTML for a document identifier.

    Supported node types map to public URLs:
      * "Patent"        -> Google Patents
      * "ResearchPaper" -> Arxiv abstract page

    Returns the decoded page body with newlines removed, or an empty string
    on any failure (unknown node type, network error, bad HTTP status).
    """
    url_templates = {
        "Patent": f"https://patents.google.com/patent/{number}/en",
        "ResearchPaper": f"https://arxiv.org/abs/{number}",
    }

    target = url_templates.get(node_type)
    if target is None:
        logger.warning(f"Unknown node type: {node_type} for number {number}")
        return ""

    try:
        resp = requests.get(target, timeout=10)
        resp.raise_for_status()
        # Collapse the document onto one line; undecodable bytes are replaced.
        return resp.content.decode('utf-8', errors='replace').replace("\n", "")
    except requests.exceptions.RequestException as e:
        logger.error(f"Request error for {node_type} number: {number} at URL {target}: {e}")
        return ""
    except Exception as e:
        logger.error(f"An unexpected error occurred in get_content for {number}: {e}")
        return ""
|
|
|
def _extract_arxiv_title(soup) -> str:
    """Extract the paper title from a parsed Arxiv abstract page.

    Arxiv renders the title as <h1 class="title"> containing a
    <span class="descriptor">Title:</span> marker; prefer the text node that
    follows the descriptor, falling back to the whole tag's text with the
    "Title:" prefix stripped. Returns None when no title tag exists.
    """
    title_tag = soup.find('h1', class_='title')
    if not title_tag:
        return None
    descriptor = title_tag.find('span', class_='descriptor')
    if descriptor:
        candidate = descriptor.next_sibling
        # NavigableString is a str subclass, so isinstance(str) accepts it.
        if candidate and isinstance(candidate, str):
            return candidate.strip()
    return title_tag.get_text(separator=" ", strip=True).replace("Title:", "").strip()


def _extract_arxiv_abstract(soup) -> str:
    """Extract the abstract text, stripping any leading "Abstract:" prefix.

    Returns None when the page has no <blockquote class="abstract"> element.
    """
    abstract_tag = soup.find('blockquote', class_='abstract')
    if not abstract_tag:
        return None
    text = abstract_tag.get_text(strip=True)
    if text.lower().startswith('abstract'):
        prefix_end = text.lower().find('abstract') + len('abstract')
        if prefix_end < len(text) and text[prefix_end] == ':':
            prefix_end += 1
        text = text[prefix_end:].strip()
    return text


def _summarize_abstract(abstract: str, rp_number: str) -> str:
    """Summarize *abstract* with Gemini; return placeholder text on failure."""
    if not gemini_model:
        return "Summary not generated (Gemini API client not available)"
    if not abstract or abstract.startswith("Error fetching content") or \
            abstract.startswith("Abstract not found"):
        return "Summary not generated (Abstract unavailable or problematic)"

    # BUG FIX: the closing tag was previously "<document>" instead of
    # "</document>", sending a malformed delimiter to the model.
    prompt = f"""You are a 3GPP standardization expert. Summarize the key information in the provided document in simple technical English relevant to identifying potential Key Issues.
Focus on challenges, gaps, or novel aspects.
Here is the document: <document>{abstract}</document>"""

    try:
        summary_text = gemini_model.generate_content(prompt).text
        logger.info(f"Summary generated for Arxiv ID: {rp_number}")
        return summary_text
    except Exception as e:
        logger.error(f"Error generating summary with Gemini for Arxiv ID {rp_number}: {e}")
        return "Error generating summary (API failure)"


def extract_research_paper_arxiv(rp_number: str, node_type: str) -> dict:
    """Extract title/abstract from an Arxiv page and generate a summary.

    Args:
        rp_number: Arxiv identifier, e.g. "2101.00001".
        node_type: Forwarded to get_content ("ResearchPaper" expected).

    Returns:
        dict with keys "document", "arxiv_id", "title", "abstract", "summary".
        This function never raises: on fetch/parse failure the corresponding
        fields hold human-readable placeholder messages.
    """
    raw_content = get_content(rp_number, node_type)

    # Placeholder values survive into the result for any step that fails.
    rp_data = {
        "document": f"Arxiv {rp_number}",
        "arxiv_id": rp_number,
        "title": "Error fetching content or content not found",
        "abstract": "Error fetching content or content not found",
        "summary": "Summary not generated"
    }

    if not raw_content:
        logger.warning(f"No content fetched for Arxiv ID: {rp_number}")
        return rp_data

    try:
        soup = BeautifulSoup(raw_content, 'html.parser')

        title = _extract_arxiv_title(soup)
        rp_data["title"] = title if title is not None else "Title not found on page"

        abstract = _extract_arxiv_abstract(soup)
        rp_data["abstract"] = abstract if abstract is not None else "Abstract not found on page"

        rp_data["summary"] = _summarize_abstract(rp_data["abstract"], rp_number)

    except Exception as e:
        # Fields populated before the failure are kept; the rest stay as
        # placeholders so the caller can report a partial result.
        logger.error(f"Error parsing content for Arxiv ID {rp_number}: {e}")

    return rp_data
|
|
|
def add_nodes_to_neo4j(driver, data_list: list, node_label: str) -> int:
    """Upsert nodes into Neo4j, keyed on the `arxiv_id` property.

    Args:
        driver: An open neo4j Driver instance.
        data_list: List of property dicts; each must contain "arxiv_id".
        node_label: Label for the MERGEd nodes. It is interpolated into the
            Cypher text (labels cannot be parameterized), so it must come
            from trusted code, never from user input.

    Returns:
        The number of nodes newly created (0 when every node already existed
        and was only updated).

    Raises:
        HTTPException: status 500 on any database failure.
    """
    if not data_list:
        logger.warning("No data provided to add_nodes_to_neo4j.")
        return 0

    # MERGE on arxiv_id: create with the full property map, or refresh an
    # existing node's properties in place.
    query = (
        f"UNWIND $data as properties "
        f"MERGE (n:{node_label} {{arxiv_id: properties.arxiv_id}}) "
        f"ON CREATE SET n = properties "
        f"ON MATCH SET n += properties"
    )

    try:
        with driver.session(database="neo4j") as session:
            # execute_write returns the lambda's value, which is already a
            # ResultSummary (from .consume()).
            summary = session.execute_write(lambda tx: tx.run(query, data=data_list).consume())
            nodes_created = summary.counters.nodes_created

            if nodes_created > 0:
                logger.info(f"{nodes_created} new {node_label} node(s) added successfully.")

            # BUG FIX: the previous code read `result.summary` on the
            # ResultSummary itself; ResultSummary has no such attribute, so
            # every successful write raised AttributeError and was reported
            # to the caller as a spurious HTTP 500.
            logger.info(f"MERGE operation for {node_label}: {summary.counters.nodes_created} created, {summary.counters.properties_set} properties affected.")

            return nodes_created
    except Exception as e:
        logger.error(f"Neo4j Error - Failed to add/update {node_label} nodes: {e}")
        raise HTTPException(status_code=500, detail=f"Neo4j database error: {e}")
|
|
|
|
|
|
|
|
|
@app.post("/add_research_paper/{arxiv_id}", status_code=201)
async def add_single_research_paper(arxiv_id: str):
    """
    Fetches a research paper from Arxiv by its ID, extracts information,
    generates a summary, and adds/updates it as a 'ResearchPaper' node in Neo4j.

    Responds 201 when a new node was created, 200 when an existing node was
    updated, 404 when the Arxiv page could not be fetched or parsed, and 500
    on configuration or database errors.
    """
    node_type = "ResearchPaper"
    logger.info(f"Processing request for Arxiv ID: {arxiv_id}")

    # Re-check configuration per request so a misconfigured server returns a
    # clear 500 instead of failing deep inside the Neo4j driver.
    if not NEO4J_URI or not NEO4J_USER or not NEO4J_PASSWORD:
        logger.error("Neo4j database connection details are not configured on the server.")
        raise HTTPException(status_code=500, detail="Neo4j database connection details are not configured on the server.")

    paper_data = extract_research_paper_arxiv(arxiv_id, node_type)

    # extract_research_paper_arxiv never raises; failure is signalled through
    # placeholder title text, which we translate into a 404 here.
    if paper_data["title"].startswith("Error fetching content") or paper_data["title"] == "Title not found on page":
        logger.warning(f"Could not fetch or parse content for Arxiv ID {arxiv_id}. Title: {paper_data['title']}")
        raise HTTPException(status_code=404, detail=f"Could not fetch or parse content for Arxiv ID {arxiv_id}. Title: {paper_data['title']}")

    driver_instance = None
    try:
        auth_token = basic_auth(NEO4J_USER, NEO4J_PASSWORD)
        driver_instance = GraphDatabase.driver(NEO4J_URI, auth=auth_token)
        driver_instance.verify_connectivity()
        logger.info("Successfully connected to Neo4j.")

        nodes_created_count = add_nodes_to_neo4j(driver_instance, [paper_data], node_type)

        if nodes_created_count > 0:
            message = f"Research paper {arxiv_id} was successfully added to Neo4j."
            status_code_response = 201
        else:
            # MERGE matched an existing node, so nothing new was created.
            message = f"Research paper {arxiv_id} was processed (potentially updated if it already existed)."
            status_code_response = 200

        logger.info(message)

        # BUG FIX: the computed status code was previously only echoed inside
        # the body while the actual HTTP status was always 201 (from the
        # decorator). JSONResponse applies the correct 200/201 to the response.
        return JSONResponse(
            status_code=status_code_response,
            content={
                "message": message,
                "data": paper_data,
                "response_status_info": status_code_response
            },
        )

    except HTTPException as e:
        logger.error(f"HTTPException during Neo4j operation for {arxiv_id}: {e.detail}")
        raise e
    except Exception as e:
        logger.error(f"An unexpected error occurred during Neo4j operation for {arxiv_id}: {e}", exc_info=True)
        raise HTTPException(status_code=500, detail=f"An unexpected server error occurred: {e}")
    finally:
        if driver_instance:
            driver_instance.close()
            logger.info("Neo4j connection closed.")