YchKhan committed
Commit ec70242 · verified · 1 Parent(s): 2817c17

Update app.py

Files changed (1)
  1. app.py +332 -176
app.py CHANGED
@@ -1,212 +1,368 @@
  from fastapi import FastAPI, HTTPException
- from fastapi.middleware.cors import CORSMiddleware
- from fastapi.responses import StreamingResponse
  from pydantic import BaseModel
- from typing import List, Dict, Any, Optional
- import json
  import requests
  from bs4 import BeautifulSoup
- import fitz  # PyMuPDF
- import urllib3
- import pandas as pd
  import io
- from duckduckgo_search import DDGS
- import re
-
- app = FastAPI(title="Patent Analyzer API", description="API for patent search and analysis")
-
- # Enable CORS for frontend
- app.add_middleware(
-     CORSMiddleware,
-     allow_origins=["*"],  # In production, specify your frontend domain
-     allow_credentials=True,
-     allow_methods=["*"],
-     allow_headers=["*"],
  )

- # Define data models
- class SearchRequest(BaseModel):
-     query: str

- class AnalysisRequest(BaseModel):
-     patent_background: str
-     pdf_url: str

- class ExcelExportRequest(BaseModel):
-     tableData: List[Dict[str, Any]]
-     userQuery: Optional[str] = None

- @app.get("/")
- async def root():
-     return {"message": "Patent Analyzer API is running"}

- @app.post("/search")
- async def search(query: str, data_type: str = None, max_references: int = 5):
-     if not query:
-         raise HTTPException(status_code=400, detail="No query provided")
-
      try:
-         if data_type == "pdf" or data_type is None:
-             search_query = f"{query} filetype:pdf"
-         elif data_type == "patent":
-             search_query = f"{query} site:patents.google.com"
          else:
-             search_query = query

-         results = search_web(search_query, max_references)
-         return {"results": results}
      except Exception as e:
-         raise HTTPException(status_code=500, detail=f"Error performing search: {str(e)}")


- @app.post("/analyze")
- async def analyze(request: AnalysisRequest):
-     if not request.patent_background or not request.pdf_url:
-         raise HTTPException(status_code=400, detail="Missing required parameters")

      try:
-         result = analyze_pdf_novelty(request.patent_background, request.pdf_url)
-         return {"result": result}
      except Exception as e:
-         raise HTTPException(status_code=500, detail=f"Error analyzing PDF: {str(e)}")

- @app.post("/export-excel")
- async def export_excel(request: ExcelExportRequest):
      try:
-         if not request.tableData:
-             raise HTTPException(status_code=400, detail="No table data provided")
-
-         # Create pandas DataFrame from the data
-         df = pd.DataFrame(request.tableData)

-         # Get the user query
-         user_query = request.userQuery or 'No query provided'

-         # Create a BytesIO object to store the Excel file
-         output = io.BytesIO()

-         # Create Excel file with xlsxwriter engine
-         with pd.ExcelWriter(output, engine='xlsxwriter') as writer:
-             # Write the data to a sheet named 'Results'
-             df.to_excel(writer, sheet_name='Results', index=False)
-
-             # Get workbook and worksheet objects
-             workbook = writer.book
-             worksheet = writer.sheets['Results']
-
-             # Add a sheet for the query
-             query_sheet = workbook.add_worksheet('Query')
-             query_sheet.write(0, 0, 'Patent Query')
-             query_sheet.write(1, 0, user_query)
-
-             # Adjust column widths
-             for i, col in enumerate(df.columns):
-                 # Get maximum column width
-                 max_len = max(
-                     df[col].astype(str).map(len).max(),
-                     len(col)
-                 ) + 2
-                 # Set column width (limit to 100 to avoid issues)
-                 worksheet.set_column(i, i, min(max_len, 100))

-         # Seek to the beginning of the BytesIO object
-         output.seek(0)

-         # Return the Excel file
-         return StreamingResponse(
-             output,
-             media_type="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
-             headers={"Content-Disposition": "attachment; filename=patent_search_results.xlsx"}
          )

      except Exception as e:
-         raise HTTPException(status_code=500, detail=f"Error exporting Excel: {str(e)}")
-
- @app.get("/extract-background/")
- def extract_background_from_url(url: str):
-     content = get_content(url)
-     if content is None:
-         raise HTTPException(status_code=404, detail="Content not found")
-     background_section = extract_background(content)
-     return {"background": background_section}
-

- @app.get("/extract-page-text/")
- def extract_page_content(url: str, max_char: int = 5000):
      try:
-         headers = {
-             "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
-             "Accept": "application/pdf"
-         }
-         response = requests.get(url, headers=headers, timeout=20, verify=False)
-         response.raise_for_status()
-
-         soup = BeautifulSoup(response.text, 'html.parser')
-         full_text = soup.get_text()
-         text = re.sub(r'\n+', ' ', full_text)[:max_char]
-         return {"text_content": text}
-     except requests.RequestException as e:
-         return {"error": f"Error fetching the page: {str(e)}"}
-
- def search_web(topic, max_references):
-     """Search the web using DuckDuckGo and return results."""
-     doc_list = []
-     with DDGS(verify=False) as ddgs:
-         i = 0
-         for r in ddgs.text(topic, region='wt-wt', safesearch='On', timelimit='n'):
-             if i >= max_references:
-                 break
-             doc_list.append({"title": r['title'], "body": r['body'], "url": r['href']})
-             i += 1
-     return doc_list
-
- def analyze_pdf_novelty(patent_background, pdf_url):
-     """Extract first page text from PDF and evaluate novelty against patent background"""
      try:
-         # Disable SSL warnings
-         urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
-         headers = {
-             "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
-             "Accept": "application/pdf"
-         }
-         # Download PDF
-         response = requests.get(pdf_url, headers=headers, timeout=10, verify=False)
-         if response.status_code != 200:
-             return {"error": f"Failed to download PDF (status code: {response.status_code})"}
-
-         # Extract first page text
-         try:
-             pdf_document = fitz.open(stream=response.content, filetype="pdf")
-             if pdf_document.page_count == 0:
-                 return {"error": "PDF has no pages"}
-
-             first_page = pdf_document.load_page(0)
-             text = re.sub(r'\n+', ' ', first_page.get_text())
-
-             # Return the extracted text for frontend analysis with OpenAI
-             # We're not doing the analysis here as it will be done in the frontend
              return {
-                 "pdf_text": text,
-                 "score": None,
-                 "justification": None
              }
-         except Exception as e:
-             return {"error": f"Error processing PDF: {str(e)}"}
      except Exception as e:
-         return {"error": f"Error: {str(e)}"}
-
-
- def get_content(url):
-     response = requests.get(url)
-     if response.status_code == 200:
-         return response.content.decode('utf-8').replace("\n", "")
-     return None
-
- def extract_background(description):
-     soup = BeautifulSoup(description, 'html.parser')
-     section = soup.find('section', itemprop='description', itemscope='')
-     matches = re.findall(r"background(.*?)(?:summary|description of the drawing)", str(section), re.DOTALL | re.IGNORECASE)
-     if matches:
-         clean_text = BeautifulSoup(matches[0], "html.parser").get_text(separator=" ")
-         return clean_text.strip()
-     return 'Not found'

  from fastapi import FastAPI, HTTPException
+ from fastapi.responses import JSONResponse
  from pydantic import BaseModel
  import requests
  from bs4 import BeautifulSoup
+ import zipfile
  import io
+ import os
+ import subprocess
+ import hashlib
+ from pathlib import Path
+ from typing import Optional
+ import uvicorn
+
+ app = FastAPI(
+     title="3GPP Document Extractor API",
+     description="API to extract and read 3GPP specification documents from zip archives",
+     version="1.0.0"
  )

+ # Pydantic models for request/response
+ class SpecRequest(BaseModel):
+     spec: str
+     use_cache: bool = True

+ class DocumentResponse(BaseModel):
+     spec: str
+     url: str
+     content: str
+     cached: bool
+     content_length: int

+ class LinkResponse(BaseModel):
+     spec: str
+     url: str
+     last_link: str

+ class ErrorResponse(BaseModel):
+     error: str
+     detail: str
+
+ def get_last_link_from_3gpp_spec(spec: str) -> Optional[str]:
+     """
+     Fetches the last clickable link from a 3GPP specification page.
+
+     Args:
+         spec: The specification identifier (e.g., "38.211").
+
+     Returns:
+         The last clickable link URL, or None if not found.
+     """
+     series = spec.split(".")[0]
+     doc_id = spec
+     url = f"https://www.3gpp.org/ftp/Specs/archive/{series}_series/{doc_id}/"

      try:
+         response = requests.get(url)
+         response.raise_for_status()
+
+         soup = BeautifulSoup(response.content, 'html.parser')
+
+         # Find all anchor tags (links)
+         links = soup.find_all('a')
+
+         # Filter out links that are just directory traversals or empty
+         clickable_links = [link for link in links if link.get('href') and not link.get('href').startswith('../')]
+
+         if clickable_links:
+             # Return the href of the last clickable link
+             return clickable_links[-1].get('href')
          else:
+             return None
+
+     except requests.exceptions.RequestException as e:
+         print(f"Error fetching the page: {e}")
+         return None
+
+ def extract_and_read_doc_from_zip_url(url: str, cache_dir: str = "document_cache") -> tuple[Optional[str], bool]:
+     """
+     Downloads a zip file from a URL, extracts the first .docx or .doc file,
+     reads its content using LibreOffice via subprocess, and returns the text.
+     Uses caching to avoid re-processing the same files.
+
+     Args:
+         url: The URL of the zip file.
+         cache_dir: Directory to store cached files.
+
+     Returns:
+         Tuple of (text_content, was_cached) where was_cached indicates if result came from cache.
+     """
+     try:
+         # Create cache directory if it doesn't exist
+         cache_path = Path(cache_dir)
+         cache_path.mkdir(exist_ok=True)
+
+         # Create a hash of the URL to use as cache key
+         url_hash = hashlib.md5(url.encode()).hexdigest()

+         # Check if cached text file exists
+         cached_txt_file = cache_path / f"{url_hash}.txt"
+         if cached_txt_file.exists():
+             print(f"Found cached version for URL: {url}")
+             with open(cached_txt_file, 'r', encoding='utf-8') as f:
+                 return f.read(), True
+
+         print(f"No cache found, processing URL: {url}")
+
+         # Download the zip file
+         response = requests.get(url, stream=True)
+         response.raise_for_status()
+
+         # Use a BytesIO object to work with the zip data in memory
+         zip_data = io.BytesIO(response.content)
+
+         with zipfile.ZipFile(zip_data, 'r') as zip_ref:
+             for file_info in zip_ref.infolist():
+                 filename = file_info.filename
+                 if filename.lower().endswith(('.docx', '.doc')):
+                     print(f"Found .docx or .doc file: {filename}")
+
+                     # Create a unique filename for the cached document
+                     file_extension = os.path.splitext(filename)[1]
+                     cached_doc_file = cache_path / f"{url_hash}{file_extension}"
+
+                     # Extract the file to cache directory
+                     zip_ref.extract(filename, cache_path)
+                     extracted_filepath = cache_path / filename
+
+                     # Move to standardized cache filename
+                     extracted_filepath.rename(cached_doc_file)
+
+                     # Use subprocess to call LibreOffice for conversion
+                     txt_filename = f"{url_hash}.txt"
+                     txt_filepath = cache_path / txt_filename
+
+                     try:
+                         # Run LibreOffice conversion using subprocess
+                         cmd = [
+                             "libreoffice",
+                             "--headless",
+                             "--convert-to", "txt",
+                             str(cached_doc_file),
+                             "--outdir", str(cache_path)
+                         ]
+
+                         result = subprocess.run(
+                             cmd,
+                             capture_output=True,
+                             text=True,
+                             timeout=60  # 60 second timeout
+                         )
+
+                         if result.returncode != 0:
+                             print(f"LibreOffice conversion failed with return code {result.returncode}")
+                             print(f"stderr: {result.stderr}")
+                             return None, False
+
+                         # The converted file will have the same base name as the original
+                         original_base_name = os.path.splitext(os.path.basename(str(cached_doc_file)))[0]
+                         converted_txt_file = cache_path / f"{original_base_name}.txt"
+
+                         # Rename to our standardized cache filename if different
+                         if converted_txt_file != txt_filepath:
+                             if converted_txt_file.exists():
+                                 converted_txt_file.rename(txt_filepath)
+
+                         # Read the converted text file
+                         if txt_filepath.exists():
+                             with open(txt_filepath, 'r', encoding='utf-8') as txt_file:
+                                 text_content = txt_file.read()
+
+                             print(f"Successfully processed and cached document from: {url}")
+                             return text_content, False
+                         else:
+                             print(f"Error: Converted text file not found at {txt_filepath}")
+                             return None, False
+
+                     except subprocess.TimeoutExpired:
+                         print("LibreOffice conversion timed out after 60 seconds")
+                         return None, False
+                     except FileNotFoundError:
+                         print("Error: LibreOffice not found. Please ensure LibreOffice is installed and in your PATH.")
+                         return None, False
+                     except Exception as e:
+                         print(f"Error running LibreOffice conversion: {e}")
+                         return None, False
+
+         print("No .docx or .doc file found in the zip archive.")
+         return None, False
+
+     except requests.exceptions.RequestException as e:
+         print(f"Error downloading or processing the zip file: {e}")
+         return None, False
+     except zipfile.BadZipFile:
+         print("Error: The downloaded file is not a valid zip file.")
+         return None, False
      except Exception as e:
+         print(f"An unexpected error occurred: {e}")
+         return None, False

+ # API Endpoints
+ @app.get("/")
+ async def root():
+     """Root endpoint with API information"""
+     return {
+         "message": "3GPP Document Extractor API",
+         "version": "1.0.0",
+         "endpoints": {
+             "GET /": "API information",
+             "GET /spec/{spec}/link": "Get last link for a 3GPP specification",
+             "POST /extract": "Extract document content from 3GPP specification",
+             "GET /health": "Health check"
+         }
+     }

+ @app.get("/health")
+ async def health_check():
+     """Health check endpoint"""
+     return {"status": "healthy", "message": "API is running"}
+
+ @app.get("/spec/{spec}/link", response_model=LinkResponse)
+ async def get_spec_link(spec: str):
+     """
+     Get the last clickable link for a 3GPP specification.
+
+     Args:
+         spec: The specification identifier (e.g., "38.211")

+     Returns:
+         LinkResponse with the specification and its last link
+     """
      try:
+         last_link = get_last_link_from_3gpp_spec(spec)
+
+         if not last_link:
+             raise HTTPException(
+                 status_code=404,
+                 detail=f"No clickable links found for specification {spec}"
+             )
+
+         # Construct full URL
+         series = spec.split(".")[0]
+         base_url = f"https://www.3gpp.org/ftp/Specs/archive/{series}_series/{spec}/"
+         full_url = base_url + last_link
+
+         return LinkResponse(
+             spec=spec,
+             url=base_url,
+             last_link=full_url
+         )
+
      except Exception as e:
+         raise HTTPException(
+             status_code=500,
+             detail=f"Error processing specification {spec}: {str(e)}"
+         )

+ @app.post("/extract", response_model=DocumentResponse)
+ async def extract_document(request: SpecRequest):
+     """
+     Extract and read document content from a 3GPP specification.
+
+     Args:
+         request: SpecRequest containing spec identifier and cache preference
+
+     Returns:
+         DocumentResponse with the extracted content
+     """
      try:
+         # First, get the last link
+         last_link = get_last_link_from_3gpp_spec(request.spec)
+
+         if not last_link:
+             raise HTTPException(
+                 status_code=404,
+                 detail=f"No clickable links found for specification {request.spec}"
+             )

+         # Construct full URL
+         series = request.spec.split(".")[0]
+         base_url = f"https://www.3gpp.org/ftp/Specs/archive/{series}_series/{request.spec}/"
+         full_url = base_url + last_link

+         # Check if it's a zip file
+         if not full_url.lower().endswith('.zip'):
+             raise HTTPException(
+                 status_code=400,
+                 detail=f"The last link is not a zip file: {full_url}"
+             )

+         # Extract and read the document
+         cache_dir = "document_cache" if request.use_cache else None
+         content, was_cached = extract_and_read_doc_from_zip_url(full_url, cache_dir)

+         if not content:
+             raise HTTPException(
+                 status_code=500,
+                 detail="Could not extract and read the document from the zip file"
+             )

+         return DocumentResponse(
+             spec=request.spec,
+             url=full_url,
+             content=content,
+             cached=was_cached,
+             content_length=len(content)
          )

+     except HTTPException:
+         raise
      except Exception as e:
+         raise HTTPException(
+             status_code=500,
+             detail=f"Error processing specification {request.spec}: {str(e)}"
+         )

+ @app.delete("/cache")
+ async def clear_cache():
+     """Clear all cached files"""
      try:
+         cache_path = Path("document_cache")
+         if cache_path.exists():
+             files_deleted = 0
+             for file in cache_path.glob("*"):
+                 if file.is_file():
+                     file.unlink()
+                     files_deleted += 1
+             return {"message": f"Cache cleared successfully. {files_deleted} files deleted."}
+         else:
+             return {"message": "Cache directory does not exist."}
+     except Exception as e:
+         raise HTTPException(
+             status_code=500,
+             detail=f"Error clearing cache: {str(e)}"
+         )
+
+ @app.get("/cache")
+ async def list_cache():
+     """List all cached files"""
      try:
+         cache_path = Path("document_cache")
+         if cache_path.exists():
+             files = []
+             for file in cache_path.glob("*"):
+                 if file.is_file():
+                     size = file.stat().st_size
+                     files.append({
+                         "name": file.name,
+                         "size": size,
+                         "size_mb": round(size / (1024 * 1024), 2)
+                     })
+             return {
+                 "cache_directory": str(cache_path),
+                 "total_files": len(files),
+                 "files": files
+             }
+         else:
              return {
+                 "cache_directory": str(cache_path),
+                 "total_files": 0,
+                 "files": [],
+                 "message": "Cache directory does not exist"
              }
      except Exception as e:
+         raise HTTPException(
+             status_code=500,
+             detail=f"Error listing cache: {str(e)}"
+         )
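
For reference, a minimal client sketch against the two main endpoints this commit introduces (GET /spec/{spec}/link and POST /extract). The base URL, the port, and the uvicorn launch command are assumptions for local testing, not part of the commit:

# Assumes the API was started locally, e.g. with: uvicorn app:app --port 8000
import requests

BASE_URL = "http://localhost:8000"  # assumed host and port

# Resolve the latest archive link for TS 38.211 (the spec used in the code's docstrings)
link_info = requests.get(f"{BASE_URL}/spec/38.211/link").json()
print(link_info["last_link"])

# Download the zip, convert the .doc/.docx with LibreOffice, and return the text
# (served from the document_cache directory on repeat calls)
resp = requests.post(f"{BASE_URL}/extract", json={"spec": "38.211", "use_cache": True})
resp.raise_for_status()
doc = resp.json()
print(doc["content_length"], "characters; cached:", doc["cached"])
print(doc["content"][:500])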