File size: 18,382 Bytes
fbf2452
 
9cfea04
fbf2452
 
 
9a4fba8
fbf2452
 
 
 
54f5fbc
fbf2452
 
 
 
 
 
 
 
 
 
54f5fbc
 
 
 
 
 
 
 
 
 
 
 
 
9c1f7cf
 
 
54f5fbc
 
 
 
 
 
 
 
 
 
fbf2452
 
 
54f5fbc
 
e7ef7cc
 
fbf2452
 
 
 
 
 
 
 
 
 
 
 
 
 
 
54f5fbc
fbf2452
54f5fbc
fbf2452
 
 
 
 
 
 
 
 
ee947c3
fbf2452
 
 
 
 
 
54f5fbc
fbf2452
 
12ab024
 
fbf2452
 
 
 
 
 
 
 
 
 
54f5fbc
 
 
fbf2452
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
12ab024
 
 
 
 
 
 
 
 
 
 
54f5fbc
12ab024
 
 
 
 
 
 
 
 
 
 
 
 
ee947c3
12ab024
 
 
 
 
2477493
12ab024
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
fbf2452
12ab024
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
fbf2452
 
12ab024
 
 
 
2b14076
 
 
12ab024
 
 
ee947c3
12ab024
 
 
 
 
ee947c3
12ab024
 
 
ee947c3
12ab024
ee947c3
12ab024
fbf2452
54f5fbc
fbf2452
 
 
 
 
 
0db5c80
 
54f5fbc
fbf2452
 
54f5fbc
 
 
 
 
 
 
fbf2452
 
 
 
 
 
54f5fbc
b701828
fbf2452
 
54f5fbc
fbf2452
 
 
 
54f5fbc
 
 
 
fbf2452
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ee947c3
fbf2452
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5053ea5
fbf2452
 
 
 
 
 
 
7c7e8ff
 
fbf2452
 
 
 
 
 
 
ee947c3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
fbf2452
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
import os
import requests
from contextlib import asynccontextmanager
from bs4 import BeautifulSoup
from fastapi import FastAPI, HTTPException
from neo4j import GraphDatabase, basic_auth
import google.generativeai as genai
import logging # Import logging module

# --- Logging Configuration ---
# Basic logger configuration to display INFO messages and above.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__) # Create a logger instance for this module

# --- Environment Variable Configuration ---
# Read at import time; endpoints re-check these and return HTTP 500 if unset.
NEO4J_URI = os.getenv("NEO4J_URI")
NEO4J_USER = os.getenv("NEO4J_USER")
NEO4J_PASSWORD = os.getenv("NEO4J_PASSWORD")

# Validation of essential configurations.
# NOTE: this only logs — the process still starts with missing credentials,
# so misconfiguration surfaces as request-time 500s rather than a crash.
if not NEO4J_URI or not NEO4J_USER or not NEO4J_PASSWORD:
    logger.critical("CRITICAL ERROR: NEO4J_URI, NEO4J_USER, and NEO4J_PASSWORD environment variables must be set.")

# --- Application Lifecycle (Startup/Shutdown) ---
@asynccontextmanager
async def lifespan(app: FastAPI):
    """Handle FastAPI startup and shutdown events.

    Startup: configures the Gemini client from the GEMINI_API_KEY environment
    variable. Failures are logged but do not abort startup, so endpoints that
    only touch Neo4j remain usable without a Gemini key.
    """
    logger.info("Initializing Gemini client...")
    if genai:
        try:
            # BUG FIX: the previous fallback read `getattr(settings, ...)`,
            # but no `settings` object exists in this module — it raised
            # NameError whenever GEMINI_API_KEY was not set in the environment.
            api_key = os.getenv("GEMINI_API_KEY")
            if not api_key:
                raise ValueError("GEMINI_API_KEY not found in environment.")
            genai.configure(api_key=api_key)
            logger.info("Gemini client configured successfully.")
        except Exception as e:
            logger.error(f"Failed to configure Gemini client: {e}", exc_info=True)
    else:
        logger.warning("Gemini library not imported. Endpoints requiring Gemini will not work.")

    yield  # API serves requests while suspended here

    # --- Shutdown ---
    logger.info("API shutting down...")
    logger.info("API shutdown complete.")

# Initialize FastAPI application.
# `lifespan` wires the startup/shutdown handler defined above (Gemini setup).
app = FastAPI(
    title="Neo4j Importer",
    description="API to fetch documents, summarize it with Gemini, and add it to Neo4j.",
    version="1.0.0",
    lifespan=lifespan
)

# --- Utility Functions (Adapted from your script) ---

def get_content(number: str, node_type: str) -> str:
    """Fetch raw page content for a patent or research paper.

    Args:
        number: Document identifier (patent number or Arxiv ID).
        node_type: Either "Patent" (Google Patents) or "ResearchPaper" (Arxiv).

    Returns:
        The decoded page body with newlines stripped, or "" on any failure
        (unknown node_type, HTTP error, network error). Never raises.
    """
    redirect_links = {
        "Patent": f"https://patents.google.com/patent/{number}/en",
        "ResearchPaper": f"https://arxiv.org/abs/{number}"
    }

    url = redirect_links.get(node_type)
    if not url:
        logger.warning(f"Unknown node type: {node_type} for number {number}")
        return ""

    try:
        # BUG FIX: requests.get without a timeout can block forever and wedge
        # the serving worker; cap connect+read time explicitly.
        response = requests.get(url, timeout=30)
        response.raise_for_status()  # Raises HTTPError for bad responses (4XX or 5XX)
        return response.content.decode('utf-8', errors='replace').replace("\n", "")
    except requests.exceptions.RequestException as e:
        logger.error(f"Request error for {node_type} number: {number} at URL {url}: {e}")
        return ""
    except Exception as e:
        logger.error(f"An unexpected error occurred in get_content for {number}: {e}")
        return ""

def extract_arxiv(rp_number: str, node_type: str = "ResearchPaper") -> dict:
    """Extract title/abstract from an Arxiv abstract page and generate a Gemini summary.

    Args:
        rp_number: Arxiv identifier (e.g. "2101.00001").
        node_type: Key into get_content's URL map; defaults to "ResearchPaper".

    Returns:
        dict with keys "document", "title", "abstract", "summary". On failure
        the fields carry descriptive placeholder strings — never raises.
    """
    rp_data = {
        "document": f"Arxiv {rp_number}",  # ID for the paper
        "title": "Error fetching content or content not found",
        "abstract": "Error fetching content or content not found",
        "summary": "Summary not yet generated"  # Default summary
    }

    raw_content = get_content(rp_number, node_type)

    if not raw_content:
        logger.warning(f"No content fetched for Arxiv ID: {rp_number}")
        return rp_data  # Returns default error data

    try:
        soup = BeautifulSoup(raw_content, 'html.parser')

        # --- Extract Title ---
        # Arxiv markup: <h1 class="title"><span class="descriptor">Title:</span> actual title</h1>
        title_tag = soup.find('h1', class_='title')
        if title_tag and title_tag.find('span', class_='descriptor'):
            title_text = title_tag.find('span', class_='descriptor').next_sibling
            if title_text and isinstance(title_text, str):
                rp_data["title"] = title_text.strip()
            else:
                rp_data["title"] = title_tag.get_text(separator=" ", strip=True).replace("Title:", "").strip()
        elif title_tag:  # Fallback when the span descriptor is absent but h1.title exists
            rp_data["title"] = title_tag.get_text(separator=" ", strip=True).replace("Title:", "").strip()

        # --- Extract Abstract ---
        abstract_tag = soup.find('blockquote', class_='abstract')
        if abstract_tag:
            abstract_text = abstract_tag.get_text(strip=True)
            if abstract_text.lower().startswith('abstract'):
                # Strip the leading "Abstract" label and its colon, if present.
                prefix_end = abstract_text.lower().find('abstract') + len('abstract')
                if prefix_end < len(abstract_text) and abstract_text[prefix_end] == ':':
                    prefix_end += 1  # Include the colon in removal
                abstract_text = abstract_text[prefix_end:].strip()
            rp_data["abstract"] = abstract_text

        # Distinguish "page fetched but field missing" from "fetch failed".
        if rp_data["title"] == "Error fetching content or content not found" and not title_tag:
            rp_data["title"] = "Title not found on page"
        if rp_data["abstract"] == "Error fetching content or content not found" and not abstract_tag:
            rp_data["abstract"] = "Abstract not found on page"

    except Exception as e:
        logger.error(f"Failed to parse content for Arxiv ID {rp_number}: {e}")

    # Generate a summary with Gemini only when a usable abstract was extracted.
    if rp_data["abstract"] and \
       not rp_data["abstract"].startswith("Error fetching content") and \
       not rp_data["abstract"].startswith("Abstract not found"):

        # BUG FIX: the closing tag was "<document>" — must be "</document>"
        # so the model sees a properly delimited document.
        prompt = f"""You are a 3GPP standardization expert. Summarize the key information in the provided document in simple technical English relevant to identifying potential Key Issues.
        Focus on challenges, gaps, or novel aspects.
        Here is the document: <document>{rp_data['abstract']}</document>"""

        try:
            model = genai.GenerativeModel("gemini-2.5-flash-preview-05-20")
            response = model.generate_content(prompt)

            rp_data["summary"] = response.text
            logger.info(f"Summary generated for Arxiv ID: {rp_number}")
        except Exception as e:
            logger.error(f"Error generating summary with Gemini for Arxiv ID {rp_number}: {e}")
            rp_data["summary"] = "Error generating summary (API failure)"
    else:
        rp_data["summary"] = "Summary not generated (Abstract unavailable or problematic)"
    return rp_data

def extract_google_patents(patent_number: str, node_type: str = "Patent") -> dict:
    """Extract title/description/claims from a Google Patents page and summarize.

    Args:
        patent_number: Patent identifier (e.g. "US1234567B2").
        node_type: Key into get_content's URL map; defaults to "Patent".

    Returns:
        dict with keys "number", "title", "description", "claim", "summary".
        On failure the fields carry descriptive placeholder strings — never raises.
    """
    # Initialize a dictionary with default error messages for consistency.
    patent_data = {
        "number": f"{patent_number}",
        "title": "Error fetching content or content not found",
        "description": "Error fetching content or content not found",
        "claim": "Error fetching content or content not found",
        "summary": "Summary not yet generated"  # Default summary
    }

    # Use the generic get_content function to fetch the raw page content.
    raw_content = get_content(patent_number, node_type)

    if not raw_content:
        logger.warning(f"No content fetched for Patent ID: {patent_number}")
        return patent_data  # Return the dictionary with default error messages.

    try:
        soup = BeautifulSoup(raw_content, 'html.parser')

        # --- Extract Title ---
        title_tag = soup.find('meta', attrs={'name': 'DC.title'})
        if title_tag and title_tag.get('content'):
            patent_data["title"] = title_tag['content'].strip()
        else:
            # Fallback to finding the title in an <h1> tag.
            title_h1 = soup.find('h1', id='title')
            if title_h1:
                patent_data["title"] = title_h1.get_text(strip=True)

        # --- Extract Description ---
        description_section = soup.find('section', itemprop='description')
        if description_section:
            # Remove machine-translation source spans to clean the output.
            for src_text in description_section.find_all('span', class_='google-src-text'):
                src_text.decompose()
            patent_data["description"] = description_section.get_text(separator=' ', strip=True)

        # --- Extract Claims ---
        claims_section = soup.find('section', itemprop='claims')
        if claims_section:
            for src_text in claims_section.find_all('span', class_='google-src-text'):
                src_text.decompose()
            patent_data["claim"] = claims_section.get_text(separator=' ', strip=True)

        # Distinguish "page fetched but section missing" from "fetch failed".
        if patent_data["title"] == "Error fetching content or content not found":
            patent_data["title"] = "Title not found on page"
        if patent_data["description"] == "Error fetching content or content not found":
            patent_data["description"] = "Description not found on page"
        if patent_data["claim"] == "Error fetching content or content not found":
            patent_data["claim"] = "Claim not found on page"

    except Exception as e:
        # Catch any unexpected errors during the parsing process.
        logger.error(f"Failed to parse content for Patent ID {patent_number}: {e}")

    # Generate a summary with Gemini only when a usable description was extracted.
    if patent_data["description"] and \
       not patent_data["description"].startswith("Error fetching content") and \
       not patent_data["description"].startswith("Description not found"):

        # BUG FIX: the closing tag was "<document>" — must be "</document>".
        prompt = f"""You are a 3GPP standardization expert. Summarize the key information in the provided document in simple technical English relevant to identifying potential Key Issues.
        Focus on challenges, gaps, or novel aspects.
        Here is the document: <document>{patent_data['description']}</document>"""

        try:
            model = genai.GenerativeModel("gemini-2.5-flash-preview-05-20")
            response = model.generate_content(prompt)

            patent_data["summary"] = response.text
            logger.info(f"Summary generated for Patent ID: {patent_number}")
        except Exception as e:
            logger.error(f"Error generating summary with Gemini for Patent ID {patent_number}: {e}")
            patent_data["summary"] = "Error generating summary (API failure)"
    else:
        # BUG FIX: this assigned to rp_data (a name from extract_arxiv that does
        # not exist here) and raised NameError whenever the description was
        # missing; it must write to patent_data.
        patent_data["summary"] = "Summary not generated (Description unavailable or problematic)"
    return patent_data

def add_nodes_to_neo4j(driver, data_list: list, node_type: str) -> int:
    """Add a list of nodes to Neo4j in a single write transaction.

    Args:
        driver: An open neo4j GraphDatabase driver.
        data_list: List of property dicts; each dict becomes one node.
        node_type: Label applied to the created nodes. NOTE: Cypher labels
            cannot be parameterized, so this is interpolated into the query
            text — only pass trusted, internal constants here, never
            user-supplied strings.

    Returns:
        Number of nodes actually created (0 when data_list is empty).

    Raises:
        HTTPException: 500 on any database failure.
    """
    if not data_list:
        logger.warning("No data provided to add_nodes_to_neo4j.")
        return 0

    # CREATE (not MERGE): duplicate submissions produce duplicate nodes.
    # Switch to MERGE on a unique key if idempotency becomes a requirement.
    query = (
        "UNWIND $data as properties "
        f"CREATE (n:{node_type}) "
        "SET n = properties"
    )

    try:
        with driver.session(database="neo4j") as session:  # Specify database if not default
            result = session.execute_write(lambda tx: tx.run(query, data=data_list).consume())
            nodes_created = result.counters.nodes_created

            if nodes_created > 0:
                logger.info(f"{nodes_created} new {node_type} node(s) added successfully.")

            return nodes_created  # Number of nodes actually created
    except Exception as e:
        logger.error(f"Neo4j Error - Failed to add/update {node_type} nodes: {e}")
        raise HTTPException(status_code=500, detail=f"Neo4j database error: {e}")


# --- FastAPI Endpoint ---
# Liveness probe: confirms the API process is up and responding.
@app.get("/")
def read_root():
    """Return a minimal health-check payload."""
    return dict(status="ok")

@app.post("/add_research_paper/{arxiv_id}", status_code=201)  # 201 Created for successful creation
async def add_single_research_paper(arxiv_id: str):
    """
    Fetches a research paper from Arxiv by its ID, extracts information,
    generates a summary, and adds it as a 'ResearchPaper' node in Neo4j.

    Raises:
        HTTPException 404: content could not be fetched or parsed.
        HTTPException 500: missing server configuration or database error.
    """
    node_type = "ResearchPaper"
    logger.info(f"Processing request for Arxiv ID: {arxiv_id}")

    if not NEO4J_URI or not NEO4J_USER or not NEO4J_PASSWORD:
        logger.error("Neo4j database connection details are not configured on the server.")
        raise HTTPException(status_code=500, detail="Neo4j database connection details are not configured on the server.")

    # Step 1: Extract paper data (includes Gemini summary generation).
    paper_data = extract_arxiv(arxiv_id, node_type)

    if paper_data["title"].startswith("Error fetching content") or paper_data["title"] == "Title not found on page":
        logger.warning(f"Could not fetch or parse content for Arxiv ID {arxiv_id}. Title: {paper_data['title']}")
        raise HTTPException(status_code=404, detail=f"Could not fetch or parse content for Arxiv ID {arxiv_id}. Title: {paper_data['title']}")

    # Step 2: Add to Neo4j. The driver context manager guarantees the
    # connection is closed even when an exception escapes (replaces the
    # previous manual try/finally bookkeeping).
    try:
        auth_token = basic_auth(NEO4J_USER, NEO4J_PASSWORD)
        with GraphDatabase.driver(NEO4J_URI, auth=auth_token) as driver_instance:
            driver_instance.verify_connectivity()
            logger.info("Successfully connected to Neo4j.")

            nodes_created_count = add_nodes_to_neo4j(driver_instance, [paper_data], node_type)

            if nodes_created_count > 0:
                logger.info(f"Research paper {arxiv_id} was successfully added to Neo4j.")

            # The HTTP 201 status comes from the route decorator; no extra
            # status bookkeeping is needed here.
            return {"data": paper_data}

    except HTTPException as e:  # Re-raise HTTPExceptions untouched
        logger.error(f"HTTPException during Neo4j operation for {arxiv_id}: {e.detail}")
        raise e
    except Exception as e:
        logger.error(f"An unexpected error occurred during Neo4j operation for {arxiv_id}: {e}", exc_info=True)
        raise HTTPException(status_code=500, detail=f"An unexpected server error occurred: {e}")


@app.post("/add_patent/{patent_id}", status_code=201)  # 201 Created for successful creation
async def add_single_patent(patent_id: str):
    """
    Fetches a patent from Google Patents by its ID, extracts information,
    generates a summary, and adds it as a 'Patent' node in Neo4j.

    Raises:
        HTTPException 404: content could not be fetched or parsed.
        HTTPException 500: missing server configuration or database error.
    """
    node_type = "Patent"
    logger.info(f"Processing request for Patent ID: {patent_id}")

    if not NEO4J_URI or not NEO4J_USER or not NEO4J_PASSWORD:
        logger.error("Neo4j database connection details are not configured on the server.")
        raise HTTPException(status_code=500, detail="Neo4j database connection details are not configured on the server.")

    # Step 1: Extract patent data (includes Gemini summary generation).
    patent_data = extract_google_patents(patent_id, node_type)

    if patent_data["title"].startswith("Error fetching content") or patent_data["title"] == "Title not found on page":
        logger.warning(f"Could not fetch or parse content for Patent ID {patent_id}. Title: {patent_data['title']}")
        raise HTTPException(status_code=404, detail=f"Could not fetch or parse content for Patent ID {patent_id}. Title: {patent_data['title']}")

    # Step 2: Add to Neo4j. The driver context manager guarantees the
    # connection is closed even when an exception escapes (replaces the
    # previous manual try/finally bookkeeping).
    try:
        auth_token = basic_auth(NEO4J_USER, NEO4J_PASSWORD)
        with GraphDatabase.driver(NEO4J_URI, auth=auth_token) as driver_instance:
            driver_instance.verify_connectivity()
            logger.info("Successfully connected to Neo4j.")

            nodes_created_count = add_nodes_to_neo4j(driver_instance, [patent_data], node_type)

            if nodes_created_count > 0:
                logger.info(f"Patent {patent_id} was successfully added to Neo4j.")

            # The HTTP 201 status comes from the route decorator; no extra
            # status bookkeeping is needed here.
            return {"data": patent_data}

    except HTTPException as e:  # Re-raise HTTPExceptions untouched
        logger.error(f"HTTPException during Neo4j operation for {patent_id}: {e.detail}")
        raise e
    except Exception as e:
        logger.error(f"An unexpected error occurred during Neo4j operation for {patent_id}: {e}", exc_info=True)
        raise HTTPException(status_code=500, detail=f"An unexpected server error occurred: {e}")