Update api.py
Browse files
api.py
CHANGED
@@ -80,9 +80,8 @@ def get_content(number: str, node_type: str) -> str:
|
|
80 |
logger.error(f"An unexpected error occurred in get_content for {number}: {e}")
|
81 |
return ""
|
82 |
|
83 |
-
def extract_research_paper_arxiv(rp_number: str, node_type: str) -> dict:
|
84 |
"""Extracts information from an Arxiv research paper and generates a summary."""
|
85 |
-
raw_content = get_content(rp_number, node_type)
|
86 |
|
87 |
rp_data = {
|
88 |
"document": f"Arxiv {rp_number}", # ID for the paper
|
@@ -91,6 +90,8 @@ def extract_research_paper_arxiv(rp_number: str, node_type: str) -> dict:
|
|
91 |
"summary": "Summary not yet generated" # Default summary
|
92 |
}
|
93 |
|
|
|
|
|
94 |
if not raw_content:
|
95 |
logger.warning(f"No content fetched for Arxiv ID: {rp_number}")
|
96 |
return rp_data # Returns default error data
|
@@ -128,30 +129,114 @@ def extract_research_paper_arxiv(rp_number: str, node_type: str) -> dict:
|
|
128 |
if rp_data["abstract"] == "Error fetching content or content not found" and not abstract_tag:
|
129 |
rp_data["abstract"] = "Abstract not found on page"
|
130 |
|
131 |
-
|
132 |
-
|
133 |
-
|
134 |
-
|
135 |
-
|
136 |
-
|
137 |
-
|
138 |
-
|
139 |
-
|
140 |
-
|
141 |
-
|
142 |
-
response = model.generate_content(prompt)
|
143 |
|
144 |
-
|
145 |
-
|
146 |
-
|
147 |
-
|
148 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
149 |
else:
|
150 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
151 |
|
152 |
except Exception as e:
|
153 |
-
|
154 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
155 |
|
156 |
def add_nodes_to_neo4j(driver, data_list: list, node_type: str):
|
157 |
"""Adds a list of nodes to Neo4j in a single transaction."""
|
|
|
80 |
logger.error(f"An unexpected error occurred in get_content for {number}: {e}")
|
81 |
return ""
|
82 |
|
83 |
+
def extract_research_paper_arxiv(rp_number: str, node_type: str = "ResearchPaper") -> dict:
|
84 |
"""Extracts information from an Arxiv research paper and generates a summary."""
|
|
|
85 |
|
86 |
rp_data = {
|
87 |
"document": f"Arxiv {rp_number}", # ID for the paper
|
|
|
90 |
"summary": "Summary not yet generated" # Default summary
|
91 |
}
|
92 |
|
93 |
+
raw_content = get_content(rp_number, node_type)
|
94 |
+
|
95 |
if not raw_content:
|
96 |
logger.warning(f"No content fetched for Arxiv ID: {rp_number}")
|
97 |
return rp_data # Returns default error data
|
|
|
129 |
if rp_data["abstract"] == "Error fetching content or content not found" and not abstract_tag:
|
130 |
rp_data["abstract"] = "Abstract not found on page"
|
131 |
|
132 |
+
except Exception as e:
|
133 |
+
logger.error(f"Failed to parse content for Arxiv ID {rp_number}: {e}")
|
134 |
+
|
135 |
+
# Generate summary with Gemini API if available and abstract exists
|
136 |
+
if rp_data["abstract"] and \
|
137 |
+
not rp_data["abstract"].startswith("Error fetching content") and \
|
138 |
+
not rp_data["abstract"].startswith("Abstract not found"):
|
139 |
+
|
140 |
+
prompt = f"""You are a 3GPP standardization expert. Summarize the key information in the provided document in simple technical English relevant to identifying potential Key Issues.
|
141 |
+
Focus on challenges, gaps, or novel aspects.
|
142 |
+
Here is the document: <document>{rp_data['abstract']}<document>"""
|
|
|
143 |
|
144 |
+
try:
|
145 |
+
model = genai.GenerativeModel("gemini-2.5-flash-preview-05-20")
|
146 |
+
response = model.generate_content(prompt)
|
147 |
+
|
148 |
+
rp_data["summary"] = response.text
|
149 |
+
logger.info(f"Summary generated for Arxiv ID: {rp_number}")
|
150 |
+
except Exception as e:
|
151 |
+
logger.error(f"Error generating summary with Gemini for Arxiv ID {rp_number}: {e}")
|
152 |
+
rp_data["summary"] = "Error generating summary (API failure)"
|
153 |
+
else:
|
154 |
+
rp_data["summary"] = "Summary not generated (Abstract unavailable or problematic)"
|
155 |
+
return rp_data
|
156 |
+
|
157 |
+
def extract_patent_data(patent_number: str, node_type: str = "Patent"):
    """
    Extracts information from a Google Patents page with robust error handling.

    Fetches the raw page for ``patent_number`` via ``get_content``, parses the
    title, description and claims with BeautifulSoup, then asks the Gemini API
    for a short technical summary of the description.

    Args:
        patent_number: Google Patents identifier used to fetch the page.
        node_type: Label forwarded to ``get_content`` (default ``"Patent"``).

    Returns:
        dict with keys ``document``, ``title``, ``description``, ``claim`` and
        ``summary``. On fetch or parse failure the affected fields keep
        explanatory placeholder strings instead of raising.
    """
    # Initialize a dictionary with default error messages for consistency.
    patent_data = {
        "document": f"Patent {patent_number}",
        "title": "Error fetching content or content not found",
        "description": "Error fetching content or content not found",
        "claim": "Error fetching content or content not found",
        "summary": "Summary not yet generated",  # Default summary
    }

    # Use the generic get_content function to fetch the raw page content.
    raw_content = get_content(patent_number, node_type)

    if not raw_content:
        logger.warning(f"No content fetched for Patent ID: {patent_number}")
        return patent_data  # Return the dictionary with default error messages.

    try:
        # Let BeautifulSoup handle the decoding from raw bytes.
        soup = BeautifulSoup(raw_content, 'html.parser')

        # --- Extract Title ---
        title_tag = soup.find('meta', attrs={'name': 'DC.title'})
        if title_tag and title_tag.get('content'):
            patent_data["title"] = title_tag['content'].strip()
        else:
            # Fallback to finding the title in an <h1> tag.
            title_h1 = soup.find('h1', id='title')
            if title_h1:
                patent_data["title"] = title_h1.get_text(strip=True)

        # --- Extract Description ---
        description_section = soup.find('section', itemprop='description')
        if description_section:
            # Remove unnecessary nested spans to clean the output.
            for src_text in description_section.find_all('span', class_='google-src-text'):
                src_text.decompose()
            patent_data["description"] = description_section.get_text(separator=' ', strip=True)

        # --- Extract Claims ---
        claims_section = soup.find('section', itemprop='claims')
        if claims_section:
            # Remove unnecessary nested spans here as well.
            for src_text in claims_section.find_all('span', class_='google-src-text'):
                src_text.decompose()
            patent_data["claim"] = claims_section.get_text(separator=' ', strip=True)

        # Update status message if specific sections were not found on the page.
        if patent_data["title"] == "Error fetching content or content not found":
            patent_data["title"] = "Title not found on page"
        if patent_data["description"] == "Error fetching content or content not found":
            patent_data["description"] = "Description not found on page"
        if patent_data["claim"] == "Error fetching content or content not found":
            patent_data["claim"] = "Claim not found on page"

    except Exception as e:
        # Catch any unexpected errors during the parsing process.
        logger.error(f"Failed to parse content for Patent ID {patent_number}: {e}")

    # Generate summary with Gemini API only when a usable description exists.
    # BUG FIX: the original referenced ``rp_data`` here (a local of the sibling
    # Arxiv function), which raised NameError at runtime, and interpolated the
    # non-existent ``rp_data['abstract']`` key into the prompt. It also left the
    # closing </document> tag malformed. All now use ``patent_data['description']``.
    if patent_data["description"] and \
       not patent_data["description"].startswith("Error fetching content") and \
       not patent_data["description"].startswith("Description not found"):

        prompt = f"""You are a 3GPP standardization expert. Summarize the key information in the provided document in simple technical English relevant to identifying potential Key Issues.
Focus on challenges, gaps, or novel aspects.
Here is the document: <document>{patent_data['description']}</document>"""

        try:
            model = genai.GenerativeModel("gemini-2.5-flash-preview-05-20")
            response = model.generate_content(prompt)

            patent_data["summary"] = response.text
            logger.info(f"Summary generated for Patent ID: {patent_number}")
        except Exception as e:
            logger.error(f"Error generating summary with Gemini for Patent ID {patent_number}: {e}")
            patent_data["summary"] = "Error generating summary (API failure)"
    else:
        # Message corrected from "Abstract unavailable" — this function gates
        # on the patent description, not an abstract.
        patent_data["summary"] = "Summary not generated (Description unavailable or problematic)"

    return patent_data
|
240 |
|
241 |
def add_nodes_to_neo4j(driver, data_list: list, node_type: str):
|
242 |
"""Adds a list of nodes to Neo4j in a single transaction."""
|