YchKhan committed
Commit ec70242 · verified · 1 Parent(s): 2817c17

Update app.py

Files changed (1)
  1. app.py +332 -176
app.py CHANGED
@@ -1,212 +1,368 @@
  from fastapi import FastAPI, HTTPException
- from fastapi.middleware.cors import CORSMiddleware
- from fastapi.responses import StreamingResponse
  from pydantic import BaseModel
- from typing import List, Dict, Any, Optional
- import json
  import requests
  from bs4 import BeautifulSoup
- import fitz  # PyMuPDF
- import urllib3
- import pandas as pd
  import io
- from duckduckgo_search import DDGS
- import re
-
- app = FastAPI(title="Patent Analyzer API", description="API for patent search and analysis")
-
- # Enable CORS for frontend
- app.add_middleware(
-     CORSMiddleware,
-     allow_origins=["*"],  # In production, specify your frontend domain
-     allow_credentials=True,
-     allow_methods=["*"],
-     allow_headers=["*"],
  )

- # Define data models
- class SearchRequest(BaseModel):
-     query: str

- class AnalysisRequest(BaseModel):
-     patent_background: str
-     pdf_url: str

- class ExcelExportRequest(BaseModel):
-     tableData: List[Dict[str, Any]]
-     userQuery: Optional[str] = None

- @app.get("/")
- async def root():
-     return {"message": "Patent Analyzer API is running"}

- @app.post("/search")
- async def search(query: str, data_type: str = None, max_references: int = 5):
-     if not query:
-         raise HTTPException(status_code=400, detail="No query provided")
-
      try:
-         if data_type == "pdf" or data_type is None:
-             search_query = f"{query} filetype:pdf"
-         elif data_type == "patent":
-             search_query = f"{query} site:patents.google.com"
          else:
-             search_query = query

-         results = search_web(search_query, max_references)
-         return {"results": results}
      except Exception as e:
-         raise HTTPException(status_code=500, detail=f"Error performing search: {str(e)}")


- @app.post("/analyze")
- async def analyze(request: AnalysisRequest):
-     if not request.patent_background or not request.pdf_url:
-         raise HTTPException(status_code=400, detail="Missing required parameters")

      try:
-         result = analyze_pdf_novelty(request.patent_background, request.pdf_url)
-         return {"result": result}
      except Exception as e:
-         raise HTTPException(status_code=500, detail=f"Error analyzing PDF: {str(e)}")

- @app.post("/export-excel")
- async def export_excel(request: ExcelExportRequest):
      try:
-         if not request.tableData:
-             raise HTTPException(status_code=400, detail="No table data provided")
-
-         # Create pandas DataFrame from the data
-         df = pd.DataFrame(request.tableData)

-         # Get the user query
-         user_query = request.userQuery or 'No query provided'

-         # Create a BytesIO object to store the Excel file
-         output = io.BytesIO()

-         # Create Excel file with xlsxwriter engine
-         with pd.ExcelWriter(output, engine='xlsxwriter') as writer:
-             # Write the data to a sheet named 'Results'
-             df.to_excel(writer, sheet_name='Results', index=False)
-
-             # Get workbook and worksheet objects
-             workbook = writer.book
-             worksheet = writer.sheets['Results']
-
-             # Add a sheet for the query
-             query_sheet = workbook.add_worksheet('Query')
-             query_sheet.write(0, 0, 'Patent Query')
-             query_sheet.write(1, 0, user_query)
-
-             # Adjust column widths
-             for i, col in enumerate(df.columns):
-                 # Get maximum column width
-                 max_len = max(
-                     df[col].astype(str).map(len).max(),
-                     len(col)
-                 ) + 2
-                 # Set column width (limit to 100 to avoid issues)
-                 worksheet.set_column(i, i, min(max_len, 100))

-         # Seek to the beginning of the BytesIO object
-         output.seek(0)

-         # Return the Excel file
-         return StreamingResponse(
-             output,
-             media_type="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
-             headers={"Content-Disposition": "attachment; filename=patent_search_results.xlsx"}
          )

      except Exception as e:
-         raise HTTPException(status_code=500, detail=f"Error exporting Excel: {str(e)}")
-
- @app.get("/extract-background/")
- def extract_background_from_url(url: str):
-     content = get_content(url)
-     if content is None:
-         raise HTTPException(status_code=404, detail="Content not found")
-     background_section = extract_background(content)
-     return {"background": background_section}
-

- @app.get("/extract-page-text/")
- def extract_page_content(url: str, max_char: int = 5000):
      try:
-         headers = {
-             "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
-             "Accept": "application/pdf"
-         }
-         response = requests.get(url, headers=headers, timeout=20, verify=False)
-         response.raise_for_status()
-
-         soup = BeautifulSoup(response.text, 'html.parser')
-         full_text = soup.get_text()
-         text = re.sub(r'\n+', ' ', full_text)[:max_char]
-         return {"text_content": text}
-     except requests.RequestException as e:
-         return {"error": f"Error fetching the page: {str(e)}"}
-
- def search_web(topic, max_references):
-     """Search the web using DuckDuckGo and return results."""
-     doc_list = []
-     with DDGS(verify=False) as ddgs:
-         i = 0
-         for r in ddgs.text(topic, region='wt-wt', safesearch='On', timelimit='n'):
-             if i >= max_references:
-                 break
-             doc_list.append({"title": r['title'], "body": r['body'], "url": r['href']})
-             i += 1
-     return doc_list
-
- def analyze_pdf_novelty(patent_background, pdf_url):
-     """Extract first page text from PDF and evaluate novelty against patent background"""
      try:
-         # Disable SSL warnings
-         urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
-         headers = {
-             "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
-             "Accept": "application/pdf"
-         }
-         # Download PDF
-         response = requests.get(pdf_url, headers=headers, timeout=10, verify=False)
-         if response.status_code != 200:
-             return {"error": f"Failed to download PDF (status code: {response.status_code})"}
-
-         # Extract first page text
-         try:
-             pdf_document = fitz.open(stream=response.content, filetype="pdf")
-             if pdf_document.page_count == 0:
-                 return {"error": "PDF has no pages"}
-
-             first_page = pdf_document.load_page(0)
-             text = re.sub(r'\n+', ' ', first_page.get_text())
-
-             # Return the extracted text for frontend analysis with OpenAI
-             # We're not doing the analysis here as it will be done in the frontend
              return {
-                 "pdf_text": text,
-                 "score": None,
-                 "justification": None
              }
-         except Exception as e:
-             return {"error": f"Error processing PDF: {str(e)}"}
      except Exception as e:
-         return {"error": f"Error: {str(e)}"}
-
-
- def get_content(url):
-     response = requests.get(url)
-     if response.status_code == 200:
-         return response.content.decode('utf-8').replace("\n", "")
-     return None
-
- def extract_background(description):
-     soup = BeautifulSoup(description, 'html.parser')
-     section = soup.find('section', itemprop='description', itemscope='')
-     matches = re.findall(r"background(.*?)(?:summary|description of the drawing)", str(section), re.DOTALL | re.IGNORECASE)
-     if matches:
-         clean_text = BeautifulSoup(matches[0], "html.parser").get_text(separator=" ")
-         return clean_text.strip()
-     return 'Not found'

  from fastapi import FastAPI, HTTPException
+ from fastapi.responses import JSONResponse
  from pydantic import BaseModel
  import requests
  from bs4 import BeautifulSoup
+ import zipfile
  import io
+ import os
+ import subprocess
+ import hashlib
+ from pathlib import Path
+ from typing import Optional
+ import uvicorn
+
+ app = FastAPI(
+     title="3GPP Document Extractor API",
+     description="API to extract and read 3GPP specification documents from zip archives",
+     version="1.0.0"
  )

+ # Pydantic models for request/response
+ class SpecRequest(BaseModel):
+     spec: str
+     use_cache: bool = True

+ class DocumentResponse(BaseModel):
+     spec: str
+     url: str
+     content: str
+     cached: bool
+     content_length: int

+ class LinkResponse(BaseModel):
+     spec: str
+     url: str
+     last_link: str

+ class ErrorResponse(BaseModel):
+     error: str
+     detail: str
+
+ def get_last_link_from_3gpp_spec(spec: str) -> Optional[str]:
+     """
+     Fetches the last clickable link from a 3GPP specification page.
+
+     Args:
+         spec: The specification identifier (e.g., "38.211").
+
+     Returns:
+         The last clickable link URL, or None if not found.
+     """
+     series = spec.split(".")[0]
+     doc_id = spec
+     url = f"https://www.3gpp.org/ftp/Specs/archive/{series}_series/{doc_id}/"

      try:
+         response = requests.get(url)
+         response.raise_for_status()
+
+         soup = BeautifulSoup(response.content, 'html.parser')
+
+         # Find all anchor tags (links)
+         links = soup.find_all('a')
+
+         # Filter out links that are just directory traversals or empty
+         clickable_links = [link for link in links if link.get('href') and not link.get('href').startswith('../')]
+
+         if clickable_links:
+             # Return the href of the last clickable link
+             return clickable_links[-1].get('href')
          else:
+             return None
+
+     except requests.exceptions.RequestException as e:
+         print(f"Error fetching the page: {e}")
+         return None
+
+ def extract_and_read_doc_from_zip_url(url: str, cache_dir: str = "document_cache") -> tuple[Optional[str], bool]:
+     """
+     Downloads a zip file from a URL, extracts the first .docx or .doc file,
+     reads its content using LibreOffice via subprocess, and returns the text.
+     Uses caching to avoid re-processing the same files.
+
+     Args:
+         url: The URL of the zip file.
+         cache_dir: Directory to store cached files.
+
+     Returns:
+         Tuple of (text_content, was_cached) where was_cached indicates if result came from cache.
+     """
+     try:
+         # Create cache directory if it doesn't exist
+         cache_path = Path(cache_dir)
+         cache_path.mkdir(exist_ok=True)
+
+         # Create a hash of the URL to use as cache key
+         url_hash = hashlib.md5(url.encode()).hexdigest()

+         # Check if cached text file exists
+         cached_txt_file = cache_path / f"{url_hash}.txt"
+         if cached_txt_file.exists():
+             print(f"Found cached version for URL: {url}")
+             with open(cached_txt_file, 'r', encoding='utf-8') as f:
+                 return f.read(), True
+
+         print(f"No cache found, processing URL: {url}")
+
+         # Download the zip file
+         response = requests.get(url, stream=True)
+         response.raise_for_status()
+
+         # Use a BytesIO object to work with the zip data in memory
+         zip_data = io.BytesIO(response.content)
+
+         with zipfile.ZipFile(zip_data, 'r') as zip_ref:
+             for file_info in zip_ref.infolist():
+                 filename = file_info.filename
+                 if filename.lower().endswith(('.docx', '.doc')):
+                     print(f"Found .docx or .doc file: {filename}")
+
+                     # Create a unique filename for the cached document
+                     file_extension = os.path.splitext(filename)[1]
+                     cached_doc_file = cache_path / f"{url_hash}{file_extension}"
+
+                     # Extract the file to cache directory
+                     zip_ref.extract(filename, cache_path)
+                     extracted_filepath = cache_path / filename
+
+                     # Move to standardized cache filename
+                     extracted_filepath.rename(cached_doc_file)
+
+                     # Use subprocess to call LibreOffice for conversion
+                     txt_filename = f"{url_hash}.txt"
+                     txt_filepath = cache_path / txt_filename
+
+                     try:
+                         # Run LibreOffice conversion using subprocess
+                         cmd = [
+                             "libreoffice",
+                             "--headless",
+                             "--convert-to", "txt",
+                             str(cached_doc_file),
+                             "--outdir", str(cache_path)
+                         ]
+
+                         result = subprocess.run(
+                             cmd,
+                             capture_output=True,
+                             text=True,
+                             timeout=60  # 60 second timeout
+                         )
+
+                         if result.returncode != 0:
+                             print(f"LibreOffice conversion failed with return code {result.returncode}")
+                             print(f"stderr: {result.stderr}")
+                             return None, False
+
+                         # The converted file will have the same base name as the original
+                         original_base_name = os.path.splitext(os.path.basename(str(cached_doc_file)))[0]
+                         converted_txt_file = cache_path / f"{original_base_name}.txt"
+
+                         # Rename to our standardized cache filename if different
+                         if converted_txt_file != txt_filepath:
+                             if converted_txt_file.exists():
+                                 converted_txt_file.rename(txt_filepath)
+
+                         # Read the converted text file
+                         if txt_filepath.exists():
+                             with open(txt_filepath, 'r', encoding='utf-8') as txt_file:
+                                 text_content = txt_file.read()
+
+                             print(f"Successfully processed and cached document from: {url}")
+                             return text_content, False
+                         else:
+                             print(f"Error: Converted text file not found at {txt_filepath}")
+                             return None, False
+
+                     except subprocess.TimeoutExpired:
+                         print("LibreOffice conversion timed out after 60 seconds")
+                         return None, False
+                     except FileNotFoundError:
+                         print("Error: LibreOffice not found. Please ensure LibreOffice is installed and in your PATH.")
+                         return None, False
+                     except Exception as e:
+                         print(f"Error running LibreOffice conversion: {e}")
+                         return None, False
+
+         print("No .docx or .doc file found in the zip archive.")
+         return None, False
+
+     except requests.exceptions.RequestException as e:
+         print(f"Error downloading or processing the zip file: {e}")
+         return None, False
+     except zipfile.BadZipFile:
+         print("Error: The downloaded file is not a valid zip file.")
+         return None, False
      except Exception as e:
+         print(f"An unexpected error occurred: {e}")
+         return None, False

+ # API Endpoints
+ @app.get("/")
+ async def root():
+     """Root endpoint with API information"""
+     return {
+         "message": "3GPP Document Extractor API",
+         "version": "1.0.0",
+         "endpoints": {
+             "GET /": "API information",
+             "GET /spec/{spec}/link": "Get last link for a 3GPP specification",
+             "POST /extract": "Extract document content from 3GPP specification",
+             "GET /health": "Health check"
+         }
+     }

+ @app.get("/health")
+ async def health_check():
+     """Health check endpoint"""
+     return {"status": "healthy", "message": "API is running"}
+
+ @app.get("/spec/{spec}/link", response_model=LinkResponse)
+ async def get_spec_link(spec: str):
+     """
+     Get the last clickable link for a 3GPP specification.
+
+     Args:
+         spec: The specification identifier (e.g., "38.211")

+     Returns:
+         LinkResponse with the specification and its last link
+     """
      try:
+         last_link = get_last_link_from_3gpp_spec(spec)
+
+         if not last_link:
+             raise HTTPException(
+                 status_code=404,
+                 detail=f"No clickable links found for specification {spec}"
+             )
+
+         # Construct full URL
+         series = spec.split(".")[0]
+         base_url = f"https://www.3gpp.org/ftp/Specs/archive/{series}_series/{spec}/"
+         full_url = base_url + last_link
+
+         return LinkResponse(
+             spec=spec,
+             url=base_url,
+             last_link=full_url
+         )
+
      except Exception as e:
+         raise HTTPException(
+             status_code=500,
+             detail=f"Error processing specification {spec}: {str(e)}"
+         )

+ @app.post("/extract", response_model=DocumentResponse)
+ async def extract_document(request: SpecRequest):
+     """
+     Extract and read document content from a 3GPP specification.
+
+     Args:
+         request: SpecRequest containing spec identifier and cache preference
+
+     Returns:
+         DocumentResponse with the extracted content
+     """
      try:
+         # First, get the last link
+         last_link = get_last_link_from_3gpp_spec(request.spec)
+
+         if not last_link:
+             raise HTTPException(
+                 status_code=404,
+                 detail=f"No clickable links found for specification {request.spec}"
+             )

+         # Construct full URL
+         series = request.spec.split(".")[0]
+         base_url = f"https://www.3gpp.org/ftp/Specs/archive/{series}_series/{request.spec}/"
+         full_url = base_url + last_link

+         # Check if it's a zip file
+         if not full_url.lower().endswith('.zip'):
+             raise HTTPException(
+                 status_code=400,
+                 detail=f"The last link is not a zip file: {full_url}"
+             )

+         # Extract and read the document
+         cache_dir = "document_cache" if request.use_cache else None
+         content, was_cached = extract_and_read_doc_from_zip_url(full_url, cache_dir)

+         if not content:
+             raise HTTPException(
+                 status_code=500,
+                 detail="Could not extract and read the document from the zip file"
+             )

+         return DocumentResponse(
+             spec=request.spec,
+             url=full_url,
+             content=content,
+             cached=was_cached,
+             content_length=len(content)
          )

+     except HTTPException:
+         raise
      except Exception as e:
+         raise HTTPException(
+             status_code=500,
+             detail=f"Error processing specification {request.spec}: {str(e)}"
+         )

+ @app.delete("/cache")
+ async def clear_cache():
+     """Clear all cached files"""
      try:
+         cache_path = Path("document_cache")
+         if cache_path.exists():
+             files_deleted = 0
+             for file in cache_path.glob("*"):
+                 if file.is_file():
+                     file.unlink()
+                     files_deleted += 1
+             return {"message": f"Cache cleared successfully. {files_deleted} files deleted."}
+         else:
+             return {"message": "Cache directory does not exist."}
+     except Exception as e:
+         raise HTTPException(
+             status_code=500,
+             detail=f"Error clearing cache: {str(e)}"
+         )
+
+ @app.get("/cache")
+ async def list_cache():
+     """List all cached files"""
      try:
+         cache_path = Path("document_cache")
+         if cache_path.exists():
+             files = []
+             for file in cache_path.glob("*"):
+                 if file.is_file():
+                     size = file.stat().st_size
+                     files.append({
+                         "name": file.name,
+                         "size": size,
+                         "size_mb": round(size / (1024 * 1024), 2)
+                     })
+             return {
+                 "cache_directory": str(cache_path),
+                 "total_files": len(files),
+                 "files": files
+             }
+         else:
              return {
+                 "cache_directory": str(cache_path),
+                 "total_files": 0,
+                 "files": [],
+                 "message": "Cache directory does not exist"
              }
      except Exception as e:
+         raise HTTPException(
+             status_code=500,
+             detail=f"Error listing cache: {str(e)}"
+         )
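
For reference, a minimal client sketch against the two main endpoints this commit introduces (GET /spec/{spec}/link and POST /extract). The base URL, the port, and the uvicorn launch command are assumptions for local testing, not part of the commit:

# Assumes the API was started locally, e.g. with: uvicorn app:app --port 8000
import requests

BASE_URL = "http://localhost:8000"  # assumed host and port

# Resolve the latest archive link for TS 38.211 (the spec used in the code's docstrings)
link_info = requests.get(f"{BASE_URL}/spec/38.211/link").json()
print(link_info["last_link"])

# Download the zip, convert the .doc/.docx with LibreOffice, and return the text
# (served from the document_cache directory on repeat calls)
resp = requests.post(f"{BASE_URL}/extract", json={"spec": "38.211", "use_cache": True})
resp.raise_for_status()
doc = resp.json()
print(doc["content_length"], "characters; cached:", doc["cached"])
print(doc["content"][:500])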