adrienbrdne committed on
Commit
12ab024
·
verified ·
1 Parent(s): 9c1f7cf

Update api.py

Browse files
Files changed (1) hide show
  1. api.py +107 -22
api.py CHANGED
@@ -80,9 +80,8 @@ def get_content(number: str, node_type: str) -> str:
80
  logger.error(f"An unexpected error occurred in get_content for {number}: {e}")
81
  return ""
82
 
83
- def extract_research_paper_arxiv(rp_number: str, node_type: str) -> dict:
84
  """Extracts information from an Arxiv research paper and generates a summary."""
85
- raw_content = get_content(rp_number, node_type)
86
 
87
  rp_data = {
88
  "document": f"Arxiv {rp_number}", # ID for the paper
@@ -91,6 +90,8 @@ def extract_research_paper_arxiv(rp_number: str, node_type: str) -> dict:
91
  "summary": "Summary not yet generated" # Default summary
92
  }
93
 
 
 
94
  if not raw_content:
95
  logger.warning(f"No content fetched for Arxiv ID: {rp_number}")
96
  return rp_data # Returns default error data
@@ -128,30 +129,114 @@ def extract_research_paper_arxiv(rp_number: str, node_type: str) -> dict:
128
  if rp_data["abstract"] == "Error fetching content or content not found" and not abstract_tag:
129
  rp_data["abstract"] = "Abstract not found on page"
130
 
131
- # Generate summary with Gemini API if available and abstract exists
132
- if rp_data["abstract"] and \
133
- not rp_data["abstract"].startswith("Error fetching content") and \
134
- not rp_data["abstract"].startswith("Abstract not found"):
135
-
136
- prompt = f"""You are a 3GPP standardization expert. Summarize the key information in the provided document in simple technical English relevant to identifying potential Key Issues.
137
- Focus on challenges, gaps, or novel aspects.
138
- Here is the document: <document>{rp_data['abstract']}<document>"""
139
-
140
- try:
141
- model = genai.GenerativeModel("gemini-2.5-flash-preview-05-20")
142
- response = model.generate_content(prompt)
143
 
144
- rp_data["summary"] = response.text
145
- logger.info(f"Summary generated for Arxiv ID: {rp_number}")
146
- except Exception as e:
147
- logger.error(f"Error generating summary with Gemini for Arxiv ID {rp_number}: {e}")
148
- rp_data["summary"] = "Error generating summary (API failure)"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
149
  else:
150
- rp_data["summary"] = "Summary not generated (Abstract unavailable or problematic)"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
151
 
152
  except Exception as e:
153
- logger.error(f"Error parsing content for Arxiv ID {rp_number}: {e}")
154
- return rp_data
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
155
 
156
  def add_nodes_to_neo4j(driver, data_list: list, node_type: str):
157
  """Adds a list of nodes to Neo4j in a single transaction."""
 
80
  logger.error(f"An unexpected error occurred in get_content for {number}: {e}")
81
  return ""
82
 
83
+ def extract_research_paper_arxiv(rp_number: str, node_type: str = "ResearchPaper") -> dict:
84
  """Extracts information from an Arxiv research paper and generates a summary."""
 
85
 
86
  rp_data = {
87
  "document": f"Arxiv {rp_number}", # ID for the paper
 
90
  "summary": "Summary not yet generated" # Default summary
91
  }
92
 
93
+ raw_content = get_content(rp_number, node_type)
94
+
95
  if not raw_content:
96
  logger.warning(f"No content fetched for Arxiv ID: {rp_number}")
97
  return rp_data # Returns default error data
 
129
  if rp_data["abstract"] == "Error fetching content or content not found" and not abstract_tag:
130
  rp_data["abstract"] = "Abstract not found on page"
131
 
132
+ except Exception as e:
133
+ logger.error(f"Failed to parse content for Arxiv ID {rp_number}: {e}")
134
+
135
+ # Generate summary with Gemini API if available and abstract exists
136
+ if rp_data["abstract"] and \
137
+ not rp_data["abstract"].startswith("Error fetching content") and \
138
+ not rp_data["abstract"].startswith("Abstract not found"):
139
+
140
+ prompt = f"""You are a 3GPP standardization expert. Summarize the key information in the provided document in simple technical English relevant to identifying potential Key Issues.
141
+ Focus on challenges, gaps, or novel aspects.
142
+ Here is the document: <document>{rp_data['abstract']}<document>"""
 
143
 
144
+ try:
145
+ model = genai.GenerativeModel("gemini-2.5-flash-preview-05-20")
146
+ response = model.generate_content(prompt)
147
+
148
+ rp_data["summary"] = response.text
149
+ logger.info(f"Summary generated for Arxiv ID: {rp_number}")
150
+ except Exception as e:
151
+ logger.error(f"Error generating summary with Gemini for Arxiv ID {rp_number}: {e}")
152
+ rp_data["summary"] = "Error generating summary (API failure)"
153
+ else:
154
+ rp_data["summary"] = "Summary not generated (Abstract unavailable or problematic)"
155
+ return rp_data
156
+
157
def _patent_section_text(soup, itemprop: str):
    """Return the cleaned text of the ``<section itemprop=...>`` element.

    Args:
        soup: Parsed BeautifulSoup document of the patent page.
        itemprop: Value of the ``itemprop`` attribute identifying the section
            (e.g. ``'description'`` or ``'claims'``).

    Returns:
        The section text joined with single spaces, or ``None`` when the page
        has no such section.
    """
    section = soup.find('section', itemprop=itemprop)
    if section is None:
        return None
    # Drop the nested 'google-src-text' spans so they do not pollute the
    # extracted text (presumably original-language source text - TODO confirm).
    for src_text in section.find_all('span', class_='google-src-text'):
        src_text.decompose()
    return section.get_text(separator=' ', strip=True)


def extract_patent_data(patent_number: str, node_type: str = "Patent") -> dict:
    """Extract title, description and claims of a patent and summarize it.

    The page is fetched through ``get_content``, parsed with BeautifulSoup,
    and the description is sent to Gemini to produce a summary.

    Args:
        patent_number: Patent identifier passed to ``get_content``.
        node_type: Node type forwarded to ``get_content``; defaults to
            ``"Patent"``.

    Returns:
        A dict with the keys ``document``, ``title``, ``description``,
        ``claim`` and ``summary``. Fields that could not be obtained hold a
        human-readable status message instead of content. The function does
        not raise on fetch, parse or summarization failures; they are logged
        and reflected in the returned messages.
    """
    not_fetched = "Error fetching content or content not found"

    # Start from default error messages so every key is always present.
    patent_data = {
        "document": f"Patent {patent_number}",
        "title": not_fetched,
        "description": not_fetched,
        "claim": not_fetched,
        "summary": "Summary not yet generated",  # Default summary
    }

    raw_content = get_content(patent_number, node_type)

    if not raw_content:
        logger.warning(f"No content fetched for Patent ID: {patent_number}")
        return patent_data  # Return the dictionary with default error messages.

    try:
        # Let BeautifulSoup handle the decoding of the raw content.
        soup = BeautifulSoup(raw_content, 'html.parser')

        # --- Extract Title ---
        title_tag = soup.find('meta', attrs={'name': 'DC.title'})
        if title_tag and title_tag.get('content'):
            patent_data["title"] = title_tag['content'].strip()
        else:
            # Fallback to finding the title in an <h1> tag.
            title_h1 = soup.find('h1', id='title')
            if title_h1:
                patent_data["title"] = title_h1.get_text(strip=True)

        # --- Extract Description ---
        description = _patent_section_text(soup, 'description')
        if description is not None:
            patent_data["description"] = description

        # --- Extract Claims ---
        claims = _patent_section_text(soup, 'claims')
        if claims is not None:
            patent_data["claim"] = claims

        # The page was fetched, so a field still holding the fetch-error
        # default means that specific section was missing from the page.
        if patent_data["title"] == not_fetched:
            patent_data["title"] = "Title not found on page"
        if patent_data["description"] == not_fetched:
            patent_data["description"] = "Description not found on page"
        if patent_data["claim"] == not_fetched:
            patent_data["claim"] = "Claim not found on page"

    except Exception as e:
        # Best effort: keep whatever was extracted before the failure.
        logger.error(f"Failed to parse content for Patent ID {patent_number}: {e}")

    # Generate the summary with the Gemini API only when a real description
    # was extracted (not one of the placeholder status messages).
    description = patent_data["description"]
    if description and \
       not description.startswith("Error fetching content") and \
       not description.startswith("Description not found"):

        # The document is wrapped in a properly closed <document> tag.
        prompt = (
            "You are a 3GPP standardization expert. Summarize the key information "
            "in the provided document in simple technical English relevant to "
            "identifying potential Key Issues.\n"
            "Focus on challenges, gaps, or novel aspects.\n"
            f"Here is the document: <document>{description}</document>"
        )

        try:
            model = genai.GenerativeModel("gemini-2.5-flash-preview-05-20")
            response = model.generate_content(prompt)

            patent_data["summary"] = response.text
            logger.info(f"Summary generated for Patent ID: {patent_number}")
        except Exception as e:
            logger.error(f"Error generating summary with Gemini for Patent ID {patent_number}: {e}")
            patent_data["summary"] = "Error generating summary (API failure)"
    else:
        # Message wording kept identical to the arXiv extraction path.
        patent_data["summary"] = "Summary not generated (Abstract unavailable or problematic)"
    return patent_data
240
 
241
  def add_nodes_to_neo4j(driver, data_list: list, node_type: str):
242
  """Adds a list of nodes to Neo4j in a single transaction."""