adrienbrdne committed on
Commit
54f5fbc
·
verified ·
1 Parent(s): fbf2452

Update api.py

Browse files
Files changed (1) hide show
  1. api.py +62 -55
api.py CHANGED
@@ -8,48 +8,53 @@ import logging # Import logging module
8
 
9
  # --- Logging Configuration ---
10
  # Basic logger configuration to display INFO messages and above.
11
- # The format includes timestamp, log level, and message.
12
- logging.basicConfig(
13
- level=logging.INFO,
14
- format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
15
- handlers=[
16
- logging.StreamHandler() # Display logs in the console (stderr by default)
17
- # You could add a logging.FileHandler("app.log") here to write to a file
18
- ]
19
- )
20
  logger = logging.getLogger(__name__) # Create a logger instance for this module
21
 
22
  # --- Environment Variable Configuration ---
23
  NEO4J_URI = os.getenv("NEO4J_URI")
24
  NEO4J_USER = os.getenv("NEO4J_USER")
25
  NEO4J_PASSWORD = os.getenv("NEO4J_PASSWORD")
26
- GEMINI_API_KEY = os.getenv("GEMINI_API_KEY")
27
 
28
  # Validation of essential configurations
29
  if not NEO4J_URI or not NEO4J_USER or not NEO4J_PASSWORD:
30
  logger.critical("CRITICAL ERROR: NEO4J_URI, NEO4J_USER, and NEO4J_PASSWORD environment variables must be set.")
31
- # In a real application, you might want to exit or prevent FastAPI from starting.
32
- # For now, we let the application try and fail at runtime if they are missing.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
33
 
34
  # Initialize FastAPI application
35
  app = FastAPI(
36
- title="Arxiv to Neo4j Importer",
37
- description="API to fetch research paper data from Arxiv, summarize it with Gemini, and add it to Neo4j.",
38
  version="1.0.0"
39
  )
40
 
41
- # --- Gemini API Client Initialization ---
42
- gemini_model = None
43
- if GEMINI_API_KEY:
44
- try:
45
- genai.configure(api_key=GEMINI_API_KEY)
46
- gemini_model = genai.GenerativeModel(model_name="gemini-2.5-flash-preview-05-20") # Specified model
47
- logger.info("Gemini API client initialized successfully.")
48
- except Exception as e:
49
- logger.warning(f"WARNING: Failed to initialize Gemini API client: {e}. Summary generation will be affected.")
50
- else:
51
- logger.warning("WARNING: GEMINI_API_KEY environment variable not set. Summary generation will be disabled.")
52
-
53
  # --- Utility Functions (Adapted from your script) ---
54
 
55
  def get_content(number: str, node_type: str) -> str:
@@ -63,9 +68,9 @@ def get_content(number: str, node_type: str) -> str:
63
  if not url:
64
  logger.warning(f"Unknown node type: {node_type} for number {number}")
65
  return ""
66
-
67
  try:
68
- response = requests.get(url, timeout=10) # Added a timeout
69
  response.raise_for_status() # Raises HTTPError for bad responses (4XX or 5XX)
70
  return response.content.decode('utf-8', errors='replace').replace("\n", "")
71
  except requests.exceptions.RequestException as e:
@@ -81,10 +86,9 @@ def extract_research_paper_arxiv(rp_number: str, node_type: str) -> dict:
81
 
82
  rp_data = {
83
  "document": f"Arxiv {rp_number}", # ID for the paper
84
- "arxiv_id": rp_number,
85
  "title": "Error fetching content or content not found",
86
  "abstract": "Error fetching content or content not found",
87
- "summary": "Summary not generated" # Default summary
88
  }
89
 
90
  if not raw_content:
@@ -97,9 +101,9 @@ def extract_research_paper_arxiv(rp_number: str, node_type: str) -> dict:
97
  # Extract Title
98
  title_tag = soup.find('h1', class_='title')
99
  if title_tag and title_tag.find('span', class_='descriptor'):
100
- title_text_candidate = title_tag.find('span', class_='descriptor').next_sibling
101
- if title_text_candidate and isinstance(title_text_candidate, str):
102
- rp_data["title"] = title_text_candidate.strip()
103
  else:
104
  rp_data["title"] = title_tag.get_text(separator=" ", strip=True).replace("Title:", "").strip()
105
  elif title_tag : # Fallback if the span descriptor is not there but h1.title exists
@@ -116,7 +120,6 @@ def extract_research_paper_arxiv(rp_number: str, node_type: str) -> dict:
116
  if prefix_end < len(abstract_text) and abstract_text[prefix_end] == ':':
117
  prefix_end += 1 # Include the colon in removal
118
  abstract_text = abstract_text[prefix_end:].strip()
119
-
120
  rp_data["abstract"] = abstract_text
121
 
122
  # Mark if title or abstract are still not found
@@ -126,62 +129,71 @@ def extract_research_paper_arxiv(rp_number: str, node_type: str) -> dict:
126
  rp_data["abstract"] = "Abstract not found on page"
127
 
128
  # Generate summary with Gemini API if available and abstract exists
129
- if gemini_model and rp_data["abstract"] and \
130
  not rp_data["abstract"].startswith("Error fetching content") and \
131
  not rp_data["abstract"].startswith("Abstract not found"):
132
- # English prompt for Gemini
133
  prompt = f"""You are a 3GPP standardization expert. Summarize the key information in the provided document in simple technical English relevant to identifying potential Key Issues.
134
  Focus on challenges, gaps, or novel aspects.
135
  Here is the document: <document>{rp_data['abstract']}<document>"""
136
 
137
  try:
138
- response = gemini_model.generate_content(prompt)
 
 
 
139
  rp_data["summary"] = response.text
140
  logger.info(f"Summary generated for Arxiv ID: {rp_number}")
141
  except Exception as e:
142
  logger.error(f"Error generating summary with Gemini for Arxiv ID {rp_number}: {e}")
143
  rp_data["summary"] = "Error generating summary (API failure)"
144
- elif not gemini_model:
145
- rp_data["summary"] = "Summary not generated (Gemini API client not available)"
146
  else:
147
  rp_data["summary"] = "Summary not generated (Abstract unavailable or problematic)"
148
 
149
  except Exception as e:
150
  logger.error(f"Error parsing content for Arxiv ID {rp_number}: {e}")
151
-
152
  return rp_data
153
 
154
- def add_nodes_to_neo4j(driver, data_list: list, node_label: str):
155
  """Adds a list of nodes to Neo4j in a single transaction."""
156
  if not data_list:
157
  logger.warning("No data provided to add_nodes_to_neo4j.")
158
  return 0
159
 
160
  query = (
161
- f"UNWIND $data as properties "
162
- f"MERGE (n:{node_label} {{arxiv_id: properties.arxiv_id}}) " # Use MERGE for idempotency
163
- f"ON CREATE SET n = properties "
164
- f"ON MATCH SET n += properties" # Update properties if the node already exists
165
  )
166
 
 
 
 
 
 
 
 
167
  try:
168
  with driver.session(database="neo4j") as session: # Specify database if not default
169
  result = session.execute_write(lambda tx: tx.run(query, data=data_list).consume())
170
  nodes_created = result.counters.nodes_created
171
 
172
  if nodes_created > 0:
173
- logger.info(f"{nodes_created} new {node_label} node(s) added successfully.")
174
 
175
  summary = result.summary
176
- logger.info(f"MERGE operation for {node_label}: {summary.counters.nodes_created} created, {summary.counters.properties_set} properties affected.")
177
-
178
  return nodes_created # Return the number of nodes actually created
179
  except Exception as e:
180
- logger.error(f"Neo4j Error - Failed to add/update {node_label} nodes: {e}")
181
  raise HTTPException(status_code=500, detail=f"Neo4j database error: {e}")
182
 
183
 
184
  # --- FastAPI Endpoint ---
 
 
 
 
185
 
186
  @app.post("/add_research_paper/{arxiv_id}", status_code=201) # 201 Created for successful creation
187
  async def add_single_research_paper(arxiv_id: str):
@@ -214,13 +226,8 @@ async def add_single_research_paper(arxiv_id: str):
214
  nodes_created_count = add_nodes_to_neo4j(driver_instance, [paper_data], node_type)
215
 
216
  if nodes_created_count > 0 :
217
- message = f"Research paper {arxiv_id} was successfully added to Neo4j."
218
  status_code_response = 201 # Created
219
- else:
220
- # If MERGE found an existing node and updated it, nodes_created_count will be 0.
221
- # This is considered a success (idempotency).
222
- message = f"Research paper {arxiv_id} was processed (potentially updated if it already existed)."
223
- status_code_response = 200 # OK (because no new creation, but operation successful)
224
 
225
  logger.info(message)
226
  # Note: FastAPI uses the status_code from the decorator or HTTPException.
 
8
 
9
  # --- Logging Configuration ---
10
  # Basic logger configuration to display INFO messages and above.
11
+ logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
 
 
 
 
 
 
 
 
12
  logger = logging.getLogger(__name__) # Create a logger instance for this module
13
 
14
  # --- Environment Variable Configuration ---
15
  NEO4J_URI = os.getenv("NEO4J_URI")
16
  NEO4J_USER = os.getenv("NEO4J_USER")
17
  NEO4J_PASSWORD = os.getenv("NEO4J_PASSWORD")
 
18
 
19
  # Validation of essential configurations
20
  if not NEO4J_URI or not NEO4J_USER or not NEO4J_PASSWORD:
21
  logger.critical("CRITICAL ERROR: NEO4J_URI, NEO4J_USER, and NEO4J_PASSWORD environment variables must be set.")
22
+
23
+ # --- Application Lifecycle (Startup/Shutdown) ---
24
@asynccontextmanager
async def lifespan(app: FastAPI):
    """Run startup work before the API serves requests and log on shutdown.

    Startup configures the Gemini SDK with the key read from the
    ``GEMINI_API_KEY`` environment variable, falling back to a
    ``GEMINI_API_KEY`` attribute on ``settings``. Any failure during
    configuration is logged (with traceback) and swallowed, so the
    application still starts without a configured Gemini client.
    """
    logger.info("Initializing Gemini client...")
    if not genai:
        logger.warning("Gemini library not imported. Endpoints requiring Gemini will not work.")
    else:
        try:
            key_from_env = os.getenv("GEMINI_API_KEY")
            # The settings fallback is only evaluated when the environment has no key.
            gemini_key = key_from_env or getattr(settings, "GEMINI_API_KEY", None)
            if not gemini_key:
                raise ValueError("GEMINI_API_KEY not found in environment or settings.")
            genai.configure(api_key=gemini_key)
            logger.info("Gemini client configured successfully.")
        except Exception as exc:
            logger.error(f"Failed to configure Gemini client: {exc}", exc_info=True)

    yield  # The application handles requests while suspended here.

    # Nothing is closed explicitly below. NOTE(review): the original comment
    # says the Neo4j connection is released by an atexit hook registered in
    # graph_client.py -- confirm that module is actually used by this API.
    logger.info("API shutting down...")
    logger.info("Neo4j client closed (likely via atexit).")
    logger.info("API shutdown complete.")
50
 
51
  # Initialize FastAPI application
52
# `lifespan` is the async context manager defined earlier in this module.
# It has to be passed here: FastAPI only runs a lifespan handler that is
# registered on the application, and that handler is the only place where
# genai.configure() is called.
app = FastAPI(
    title="Neo4j Importer",
    description="API to fetch documents, summarize it with Gemini, and add it to Neo4j.",
    version="1.0.0",
    lifespan=lifespan,
)
57
 
 
 
 
 
 
 
 
 
 
 
 
 
58
  # --- Utility Functions (Adapted from your script) ---
59
 
60
  def get_content(number: str, node_type: str) -> str:
 
68
  if not url:
69
  logger.warning(f"Unknown node type: {node_type} for number {number}")
70
  return ""
71
+
72
  try:
73
+ response = requests.get(url)
74
  response.raise_for_status() # Raises HTTPError for bad responses (4XX or 5XX)
75
  return response.content.decode('utf-8', errors='replace').replace("\n", "")
76
  except requests.exceptions.RequestException as e:
 
86
 
87
  rp_data = {
88
  "document": f"Arxiv {rp_number}", # ID for the paper
 
89
  "title": "Error fetching content or content not found",
90
  "abstract": "Error fetching content or content not found",
91
+ "summary": "Summary not yet generated" # Default summary
92
  }
93
 
94
  if not raw_content:
 
101
  # Extract Title
102
  title_tag = soup.find('h1', class_='title')
103
  if title_tag and title_tag.find('span', class_='descriptor'):
104
+ title_text = title_tag.find('span', class_='descriptor').next_sibling
105
+ if title_text and isinstance(title_text, str):
106
+ rp_data["title"] = title_text.strip()
107
  else:
108
  rp_data["title"] = title_tag.get_text(separator=" ", strip=True).replace("Title:", "").strip()
109
  elif title_tag : # Fallback if the span descriptor is not there but h1.title exists
 
120
  if prefix_end < len(abstract_text) and abstract_text[prefix_end] == ':':
121
  prefix_end += 1 # Include the colon in removal
122
  abstract_text = abstract_text[prefix_end:].strip()
 
123
  rp_data["abstract"] = abstract_text
124
 
125
  # Mark if title or abstract are still not found
 
129
  rp_data["abstract"] = "Abstract not found on page"
130
 
131
  # Generate summary with Gemini API if available and abstract exists
132
+ if rp_data["abstract"] and \
133
  not rp_data["abstract"].startswith("Error fetching content") and \
134
  not rp_data["abstract"].startswith("Abstract not found"):
135
+
136
  prompt = f"""You are a 3GPP standardization expert. Summarize the key information in the provided document in simple technical English relevant to identifying potential Key Issues.
137
  Focus on challenges, gaps, or novel aspects.
138
  Here is the document: <document>{rp_data['abstract']}<document>"""
139
 
140
  try:
141
+ model_name = "gemini-2.5-flash-preview-05-20"
142
+ model = genai.GenerativeModel(model_name)
143
+
144
+ response = model.generate_content(prompt)
145
  rp_data["summary"] = response.text
146
  logger.info(f"Summary generated for Arxiv ID: {rp_number}")
147
  except Exception as e:
148
  logger.error(f"Error generating summary with Gemini for Arxiv ID {rp_number}: {e}")
149
  rp_data["summary"] = "Error generating summary (API failure)"
 
 
150
  else:
151
  rp_data["summary"] = "Summary not generated (Abstract unavailable or problematic)"
152
 
153
  except Exception as e:
154
  logger.error(f"Error parsing content for Arxiv ID {rp_number}: {e}")
 
155
  return rp_data
156
 
157
def add_nodes_to_neo4j(driver, data_list: list, node_type: str):
    """Create one Neo4j node per entry of ``data_list`` in a single write transaction.

    Args:
        driver: Neo4j driver object; only ``driver.session(database="neo4j")``
            is used.
        data_list: Property maps, one per node to create.
        node_type: Label given to every created node. It is interpolated into
            the Cypher text (labels cannot be supplied as query parameters),
            so it must be a trusted identifier, never raw user input.

    Returns:
        The number of nodes created, or 0 when ``data_list`` is empty.

    Raises:
        HTTPException: With status 500 when opening the session or running
            the query fails.
    """
    if not data_list:
        logger.warning("No data provided to add_nodes_to_neo4j.")
        return 0

    # Every fragment but the last ends with a space: adjacent string literals
    # are concatenated verbatim, and without separators the clauses run
    # together into invalid Cypher ("...propertiesCREATE (n:Label)SET ...").
    # NOTE: CREATE always inserts a new node, so submitting the same data
    # twice produces duplicate nodes.
    query = (
        "UNWIND $data as properties "
        f"CREATE (n:{node_type}) "
        "SET n = properties"
    )

    try:
        with driver.session(database="neo4j") as session:  # Specify database if not default
            # consume() returns the ResultSummary itself, so the counters are
            # read directly from it (it has no nested ``.summary`` attribute).
            summary = session.execute_write(
                lambda tx: tx.run(query, data=data_list).consume()
            )
            counters = summary.counters
            nodes_created = counters.nodes_created

            if nodes_created > 0:
                logger.info(f"{nodes_created} new {node_type} node(s) added successfully.")

            logger.info(
                f"CREATE operation for {node_type}: {counters.nodes_created} created, "
                f"{counters.properties_set} properties affected."
            )
            return nodes_created  # Number of nodes actually created
    except Exception as e:
        logger.error(f"Neo4j Error - Failed to add/update {node_type} nodes: {e}")
        raise HTTPException(status_code=500, detail=f"Neo4j database error: {e}") from e
190
 
191
 
192
  # --- FastAPI Endpoint ---
193
+ # API state check route
194
@app.get("/")
def read_root():
    """Answer the root route with a fixed payload so callers can check the API is up."""
    health_payload = {"status": "ok"}
    return health_payload
197
 
198
  @app.post("/add_research_paper/{arxiv_id}", status_code=201) # 201 Created for successful creation
199
  async def add_single_research_paper(arxiv_id: str):
 
226
  nodes_created_count = add_nodes_to_neo4j(driver_instance, [paper_data], node_type)
227
 
228
  if nodes_created_count > 0 :
229
+ logger.info(message = f"Research paper {arxiv_id} was successfully added to Neo4j.")
230
  status_code_response = 201 # Created
 
 
 
 
 
231
 
232
  logger.info(message)
233
  # Note: FastAPI uses the status_code from the decorator or HTTPException.