Update api.py
Browse files
api.py
CHANGED
@@ -8,48 +8,53 @@ import logging # Import logging module
|
|
8 |
|
9 |
# --- Logging Configuration ---
|
10 |
# Basic logger configuration to display INFO messages and above.
|
11 |
-
|
12 |
-
logging.basicConfig(
|
13 |
-
level=logging.INFO,
|
14 |
-
format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
|
15 |
-
handlers=[
|
16 |
-
logging.StreamHandler() # Display logs in the console (stderr by default)
|
17 |
-
# You could add a logging.FileHandler("app.log") here to write to a file
|
18 |
-
]
|
19 |
-
)
|
20 |
logger = logging.getLogger(__name__) # Create a logger instance for this module
|
21 |
|
22 |
# --- Environment Variable Configuration ---
|
23 |
NEO4J_URI = os.getenv("NEO4J_URI")
|
24 |
NEO4J_USER = os.getenv("NEO4J_USER")
|
25 |
NEO4J_PASSWORD = os.getenv("NEO4J_PASSWORD")
|
26 |
-
GEMINI_API_KEY = os.getenv("GEMINI_API_KEY")
|
27 |
|
28 |
# Validation of essential configurations
|
29 |
if not NEO4J_URI or not NEO4J_USER or not NEO4J_PASSWORD:
|
30 |
logger.critical("CRITICAL ERROR: NEO4J_URI, NEO4J_USER, and NEO4J_PASSWORD environment variables must be set.")
|
31 |
-
|
32 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
33 |
|
34 |
# Initialize FastAPI application
|
35 |
app = FastAPI(
|
36 |
-
title="
|
37 |
-
description="API to fetch
|
38 |
version="1.0.0"
|
39 |
)
|
40 |
|
41 |
-
# --- Gemini API Client Initialization ---
|
42 |
-
gemini_model = None
|
43 |
-
if GEMINI_API_KEY:
|
44 |
-
try:
|
45 |
-
genai.configure(api_key=GEMINI_API_KEY)
|
46 |
-
gemini_model = genai.GenerativeModel(model_name="gemini-2.5-flash-preview-05-20") # Specified model
|
47 |
-
logger.info("Gemini API client initialized successfully.")
|
48 |
-
except Exception as e:
|
49 |
-
logger.warning(f"WARNING: Failed to initialize Gemini API client: {e}. Summary generation will be affected.")
|
50 |
-
else:
|
51 |
-
logger.warning("WARNING: GEMINI_API_KEY environment variable not set. Summary generation will be disabled.")
|
52 |
-
|
53 |
# --- Utility Functions (Adapted from your script) ---
|
54 |
|
55 |
def get_content(number: str, node_type: str) -> str:
|
@@ -63,9 +68,9 @@ def get_content(number: str, node_type: str) -> str:
|
|
63 |
if not url:
|
64 |
logger.warning(f"Unknown node type: {node_type} for number {number}")
|
65 |
return ""
|
66 |
-
|
67 |
try:
|
68 |
-
response = requests.get(url
|
69 |
response.raise_for_status() # Raises HTTPError for bad responses (4XX or 5XX)
|
70 |
return response.content.decode('utf-8', errors='replace').replace("\n", "")
|
71 |
except requests.exceptions.RequestException as e:
|
@@ -81,10 +86,9 @@ def extract_research_paper_arxiv(rp_number: str, node_type: str) -> dict:
|
|
81 |
|
82 |
rp_data = {
|
83 |
"document": f"Arxiv {rp_number}", # ID for the paper
|
84 |
-
"arxiv_id": rp_number,
|
85 |
"title": "Error fetching content or content not found",
|
86 |
"abstract": "Error fetching content or content not found",
|
87 |
-
"summary": "Summary not generated" # Default summary
|
88 |
}
|
89 |
|
90 |
if not raw_content:
|
@@ -97,9 +101,9 @@ def extract_research_paper_arxiv(rp_number: str, node_type: str) -> dict:
|
|
97 |
# Extract Title
|
98 |
title_tag = soup.find('h1', class_='title')
|
99 |
if title_tag and title_tag.find('span', class_='descriptor'):
|
100 |
-
|
101 |
-
if
|
102 |
-
rp_data["title"] =
|
103 |
else:
|
104 |
rp_data["title"] = title_tag.get_text(separator=" ", strip=True).replace("Title:", "").strip()
|
105 |
elif title_tag : # Fallback if the span descriptor is not there but h1.title exists
|
@@ -116,7 +120,6 @@ def extract_research_paper_arxiv(rp_number: str, node_type: str) -> dict:
|
|
116 |
if prefix_end < len(abstract_text) and abstract_text[prefix_end] == ':':
|
117 |
prefix_end += 1 # Include the colon in removal
|
118 |
abstract_text = abstract_text[prefix_end:].strip()
|
119 |
-
|
120 |
rp_data["abstract"] = abstract_text
|
121 |
|
122 |
# Mark if title or abstract are still not found
|
@@ -126,62 +129,71 @@ def extract_research_paper_arxiv(rp_number: str, node_type: str) -> dict:
|
|
126 |
rp_data["abstract"] = "Abstract not found on page"
|
127 |
|
128 |
# Generate summary with Gemini API if available and abstract exists
|
129 |
-
if
|
130 |
not rp_data["abstract"].startswith("Error fetching content") and \
|
131 |
not rp_data["abstract"].startswith("Abstract not found"):
|
132 |
-
|
133 |
prompt = f"""You are a 3GPP standardization expert. Summarize the key information in the provided document in simple technical English relevant to identifying potential Key Issues.
|
134 |
Focus on challenges, gaps, or novel aspects.
|
135 |
Here is the document: <document>{rp_data['abstract']}<document>"""
|
136 |
|
137 |
try:
|
138 |
-
|
|
|
|
|
|
|
139 |
rp_data["summary"] = response.text
|
140 |
logger.info(f"Summary generated for Arxiv ID: {rp_number}")
|
141 |
except Exception as e:
|
142 |
logger.error(f"Error generating summary with Gemini for Arxiv ID {rp_number}: {e}")
|
143 |
rp_data["summary"] = "Error generating summary (API failure)"
|
144 |
-
elif not gemini_model:
|
145 |
-
rp_data["summary"] = "Summary not generated (Gemini API client not available)"
|
146 |
else:
|
147 |
rp_data["summary"] = "Summary not generated (Abstract unavailable or problematic)"
|
148 |
|
149 |
except Exception as e:
|
150 |
logger.error(f"Error parsing content for Arxiv ID {rp_number}: {e}")
|
151 |
-
|
152 |
return rp_data
|
153 |
|
154 |
-
def add_nodes_to_neo4j(driver, data_list: list,
|
155 |
"""Adds a list of nodes to Neo4j in a single transaction."""
|
156 |
if not data_list:
|
157 |
logger.warning("No data provided to add_nodes_to_neo4j.")
|
158 |
return 0
|
159 |
|
160 |
query = (
|
161 |
-
|
162 |
-
f"
|
163 |
-
|
164 |
-
f"ON MATCH SET n += properties" # Update properties if the node already exists
|
165 |
)
|
166 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
167 |
try:
|
168 |
with driver.session(database="neo4j") as session: # Specify database if not default
|
169 |
result = session.execute_write(lambda tx: tx.run(query, data=data_list).consume())
|
170 |
nodes_created = result.counters.nodes_created
|
171 |
|
172 |
if nodes_created > 0:
|
173 |
-
logger.info(f"{nodes_created} new {
|
174 |
|
175 |
summary = result.summary
|
176 |
-
logger.info(f"
|
177 |
-
|
178 |
return nodes_created # Return the number of nodes actually created
|
179 |
except Exception as e:
|
180 |
-
logger.error(f"Neo4j Error - Failed to add/update {
|
181 |
raise HTTPException(status_code=500, detail=f"Neo4j database error: {e}")
|
182 |
|
183 |
|
184 |
# --- FastAPI Endpoint ---
|
|
|
|
|
|
|
|
|
185 |
|
186 |
@app.post("/add_research_paper/{arxiv_id}", status_code=201) # 201 Created for successful creation
|
187 |
async def add_single_research_paper(arxiv_id: str):
|
@@ -214,13 +226,8 @@ async def add_single_research_paper(arxiv_id: str):
|
|
214 |
nodes_created_count = add_nodes_to_neo4j(driver_instance, [paper_data], node_type)
|
215 |
|
216 |
if nodes_created_count > 0 :
|
217 |
-
message = f"Research paper {arxiv_id} was successfully added to Neo4j."
|
218 |
status_code_response = 201 # Created
|
219 |
-
else:
|
220 |
-
# If MERGE found an existing node and updated it, nodes_created_count will be 0.
|
221 |
-
# This is considered a success (idempotency).
|
222 |
-
message = f"Research paper {arxiv_id} was processed (potentially updated if it already existed)."
|
223 |
-
status_code_response = 200 # OK (because no new creation, but operation successful)
|
224 |
|
225 |
logger.info(message)
|
226 |
# Note: FastAPI uses the status_code from the decorator or HTTPException.
|
|
|
8 |
|
9 |
# --- Logging Configuration ---
|
10 |
# Basic logger configuration to display INFO messages and above.
|
11 |
+
# Root logging setup: INFO and above, with timestamp / logger name / level / message.
logging.basicConfig(
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)
logger = logging.getLogger(__name__)  # Logger dedicated to this module

# --- Environment Variable Configuration ---
# Neo4j connection settings, read once at import time (None when a variable is unset).
NEO4J_URI = os.environ.get("NEO4J_URI")
NEO4J_USER = os.environ.get("NEO4J_USER")
NEO4J_PASSWORD = os.environ.get("NEO4J_PASSWORD")

# Validation of essential configurations: a missing or empty value is only
# reported here; the module keeps loading.
if not all((NEO4J_URI, NEO4J_USER, NEO4J_PASSWORD)):
    logger.critical("CRITICAL ERROR: NEO4J_URI, NEO4J_USER, and NEO4J_PASSWORD environment variables must be set.")
|
22 |
+
|
23 |
+
# --- Application Lifecycle (Startup/Shutdown) ---
|
24 |
+
@asynccontextmanager
async def lifespan(app: FastAPI):
    """Handle application startup and shutdown.

    Before the ``yield``: try to configure the Gemini client from the
    ``GEMINI_API_KEY`` environment variable, falling back to an attribute of
    the same name on ``settings``. A configuration failure is logged with its
    traceback and otherwise ignored.

    After the ``yield``: only log shutdown messages; nothing is closed here.
    """
    logger.info("Initializing Gemini client...")
    if not genai:
        logger.warning("Gemini library not imported. Endpoints requiring Gemini will not work.")
    else:
        try:
            # The environment variable takes precedence; settings is consulted
            # only when the variable is unset or empty.
            api_key = os.getenv("GEMINI_API_KEY") or getattr(settings, "GEMINI_API_KEY", None)
            if not api_key:
                raise ValueError("GEMINI_API_KEY not found in environment or settings.")
            genai.configure(api_key=api_key)
            logger.info("Gemini client configured successfully.")
        except Exception as exc:
            logger.error(f"Failed to configure Gemini client: {exc}", exc_info=True)

    yield  # API runs here

    # --- Shutdown ---
    logger.info("API shutting down...")
    # No explicit Neo4j close is performed: per the original author's note the
    # connection is closed by an atexit hook registered in graph_client.py
    # (not visible from this file -- TODO confirm).
    logger.info("Neo4j client closed (likely via atexit).")
    logger.info("API shutdown complete.")
|
50 |
|
51 |
# Initialize FastAPI application
|
52 |
app = FastAPI(
    title="Neo4j Importer",
    description="API to fetch documents, summarize it with Gemini, and add it to Neo4j.",
    version="1.0.0",
    # Register the lifespan handler defined above. Without this argument its
    # startup code (which calls genai.configure) is never executed.
    lifespan=lifespan,
)
|
57 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
58 |
# --- Utility Functions (Adapted from your script) ---
|
59 |
|
60 |
def get_content(number: str, node_type: str) -> str:
|
|
|
68 |
if not url:
|
69 |
logger.warning(f"Unknown node type: {node_type} for number {number}")
|
70 |
return ""
|
71 |
+
|
72 |
try:
|
73 |
+
response = requests.get(url)
|
74 |
response.raise_for_status() # Raises HTTPError for bad responses (4XX or 5XX)
|
75 |
return response.content.decode('utf-8', errors='replace').replace("\n", "")
|
76 |
except requests.exceptions.RequestException as e:
|
|
|
86 |
|
87 |
rp_data = {
|
88 |
"document": f"Arxiv {rp_number}", # ID for the paper
|
|
|
89 |
"title": "Error fetching content or content not found",
|
90 |
"abstract": "Error fetching content or content not found",
|
91 |
+
"summary": "Summary not yet generated" # Default summary
|
92 |
}
|
93 |
|
94 |
if not raw_content:
|
|
|
101 |
# Extract Title
|
102 |
title_tag = soup.find('h1', class_='title')
|
103 |
if title_tag and title_tag.find('span', class_='descriptor'):
|
104 |
+
title_text = title_tag.find('span', class_='descriptor').next_sibling
|
105 |
+
if title_text and isinstance(title_text, str):
|
106 |
+
rp_data["title"] = title_text.strip()
|
107 |
else:
|
108 |
rp_data["title"] = title_tag.get_text(separator=" ", strip=True).replace("Title:", "").strip()
|
109 |
elif title_tag : # Fallback if the span descriptor is not there but h1.title exists
|
|
|
120 |
if prefix_end < len(abstract_text) and abstract_text[prefix_end] == ':':
|
121 |
prefix_end += 1 # Include the colon in removal
|
122 |
abstract_text = abstract_text[prefix_end:].strip()
|
|
|
123 |
rp_data["abstract"] = abstract_text
|
124 |
|
125 |
# Mark if title or abstract are still not found
|
|
|
129 |
rp_data["abstract"] = "Abstract not found on page"
|
130 |
|
131 |
# Generate summary with Gemini API if available and abstract exists
|
132 |
+
if rp_data["abstract"] and \
|
133 |
not rp_data["abstract"].startswith("Error fetching content") and \
|
134 |
not rp_data["abstract"].startswith("Abstract not found"):
|
135 |
+
|
136 |
prompt = f"""You are a 3GPP standardization expert. Summarize the key information in the provided document in simple technical English relevant to identifying potential Key Issues.
|
137 |
Focus on challenges, gaps, or novel aspects.
|
138 |
Here is the document: <document>{rp_data['abstract']}<document>"""
|
139 |
|
140 |
try:
|
141 |
+
model_name = "gemini-2.5-flash-preview-05-20"
|
142 |
+
model = genai.GenerativeModel(model_name)
|
143 |
+
|
144 |
+
response = model.generate_content(prompt)
|
145 |
rp_data["summary"] = response.text
|
146 |
logger.info(f"Summary generated for Arxiv ID: {rp_number}")
|
147 |
except Exception as e:
|
148 |
logger.error(f"Error generating summary with Gemini for Arxiv ID {rp_number}: {e}")
|
149 |
rp_data["summary"] = "Error generating summary (API failure)"
|
|
|
|
|
150 |
else:
|
151 |
rp_data["summary"] = "Summary not generated (Abstract unavailable or problematic)"
|
152 |
|
153 |
except Exception as e:
|
154 |
logger.error(f"Error parsing content for Arxiv ID {rp_number}: {e}")
|
|
|
155 |
return rp_data
|
156 |
|
157 |
+
def add_nodes_to_neo4j(driver, data_list: list, node_type: str):
    """Create one node per entry of ``data_list`` in a single write transaction.

    Args:
        driver: Neo4j driver object; only ``driver.session(database=...)`` is
            used here.
        data_list: Property maps, one per node to create. Passed to the query
            as the ``$data`` parameter.
        node_type: Label given to every created node. It is interpolated into
            the Cypher text, so it must come from trusted code and never from
            request input.

    Returns:
        The number of nodes created according to the query counters, or 0
        when ``data_list`` is empty (no query is run in that case).

    Raises:
        HTTPException: status 500 when opening the session, running the query
            or reading its counters fails.
    """
    if not data_list:
        logger.warning("No data provided to add_nodes_to_neo4j.")
        return 0

    # Adjacent string literals are concatenated verbatim, so every clause
    # needs its own trailing space; without it the query is invalid Cypher.
    # NOTE: CREATE always inserts a new node, so importing the same data twice
    # yields duplicates. Use MERGE on a unique key if idempotency is required.
    query = (
        "UNWIND $data as properties "
        f"CREATE (n:{node_type}) "
        "SET n = properties"
    )

    try:
        with driver.session(database="neo4j") as session:  # Specify database if not default
            # consume() already returns the result summary, so the counters
            # are read straight from the value returned by execute_write.
            summary = session.execute_write(lambda tx: tx.run(query, data=data_list).consume())

        counters = summary.counters
        nodes_created = counters.nodes_created

        if nodes_created > 0:
            logger.info(f"{nodes_created} new {node_type} node(s) added successfully.")

        logger.info(f"CREATE operation for {node_type}: {counters.nodes_created} created, {counters.properties_set} properties affected.")
        return nodes_created  # Return the number of nodes actually created
    except Exception as e:
        logger.error(f"Neo4j Error - Failed to add/update {node_type} nodes: {e}")
        raise HTTPException(status_code=500, detail=f"Neo4j database error: {e}") from e
|
190 |
|
191 |
|
192 |
# --- FastAPI Endpoint ---
|
193 |
+
# API state check route
|
194 |
+
@app.get("/")
def read_root():
    """Health-check route: always answers with a fixed ``status: ok`` payload."""
    payload = {"status": "ok"}
    return payload
|
197 |
|
198 |
@app.post("/add_research_paper/{arxiv_id}", status_code=201) # 201 Created for successful creation
|
199 |
async def add_single_research_paper(arxiv_id: str):
|
|
|
226 |
nodes_created_count = add_nodes_to_neo4j(driver_instance, [paper_data], node_type)
|
227 |
|
228 |
if nodes_created_count > 0 :
|
229 |
+
# Bind the text to `message`: Logger.info() has no `message` keyword (its first
# parameter is the positional `msg`), and the logger.info(message) call further
# down needs this name to exist.
message = f"Research paper {arxiv_id} was successfully added to Neo4j."
|
230 |
status_code_response = 201 # Created
|
|
|
|
|
|
|
|
|
|
|
231 |
|
232 |
logger.info(message)
|
233 |
# Note: FastAPI uses the status_code from the decorator or HTTPException.
|